diff --git a/AGENTS.md b/AGENTS.md index 0396b617..d414f1a7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -92,9 +92,9 @@ Tests/examples automatically skip if system lacks required resources. Heavy exam - **Google-style docstrings** - **Ruff** for linting/formatting - Use `...` in `@generative` function bodies -- Use `...` in `@generative` function bodies - Prefer primitives over classes - **Friendly Dependency Errors**: Wraps optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples. +- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. Metrics are automatically recorded by `TokenMetricsPlugin` — don't add manual `record_token_usage_metrics()` calls. ## 5. Commits & Hooks [Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:` diff --git a/docs/dev/telemetry.md b/docs/dev/telemetry.md index 09fb342a..b0a52d6b 100644 --- a/docs/dev/telemetry.md +++ b/docs/dev/telemetry.md @@ -129,9 +129,8 @@ All token metrics include these attributes following Gen-AI semantic conventions | Attribute | Description | Example Values | |-----------|-------------|----------------| -| `gen_ai.system` | Backend system name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` | +| `gen_ai.provider.name` | Backend provider name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` | | `gen_ai.request.model` | Model identifier | `gpt-4`, `llama3.2:7b`, `granite-3.1-8b-instruct` | -| `mellea.backend` | Backend class name | `OpenAIBackend`, `OllamaBackend`, `WatsonxBackend` | #### Backend Support @@ -365,12 +364,30 @@ if is_metrics_enabled(): print("Token metrics are being collected") ``` +Access token usage data from `ModelOutputThunk`: + +```python +from mellea import start_session + +with start_session() as m: + result = m.instruct("Write a haiku about programming") + + # Access token usage (follows OpenAI API format) + if result.usage: + print(f"Prompt tokens: {result.usage['prompt_tokens']}") + print(f"Completion tokens: {result.usage['completion_tokens']}") + print(f"Total tokens: {result.usage['total_tokens']}") +``` + +The `usage` field is a dictionary with three keys: `prompt_tokens`, `completion_tokens`, and `total_tokens`. All backends populate this field consistently. 
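+
+Because every backend emits the same three keys, usage from several calls can be summed directly. A minimal sketch (the `collections.Counter` aggregation is an illustrative pattern, not a mellea helper):
+
+```python
+from collections import Counter
+
+from mellea import start_session
+
+totals: Counter = Counter()
+with start_session() as m:
+    for topic in ["recursion", "generics", "unit tests"]:
+        result = m.instruct(f"Write a haiku about {topic}")
+        if result.usage:  # None when the backend could not report usage
+            totals.update(result.usage)
+
+print(f"Total tokens across all calls: {totals['total_tokens']}")
+```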
+ #### Performance -- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), `record_token_usage_metrics()` returns immediately with no processing +- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), the TokenMetricsPlugin is not registered and has no overhead - **Minimal overhead when enabled**: Counter increments are extremely fast (~nanoseconds per operation) - **Async export**: Metrics are batched and exported asynchronously (default: every 60 seconds) - **Non-blocking**: Metric recording never blocks LLM calls +- **Automatic collection**: Metrics are recorded via hooks after generation completes—no manual instrumentation needed #### Use Cases diff --git a/docs/examples/telemetry/metrics_example.py b/docs/examples/telemetry/metrics_example.py index d7bf1c54..c8630c55 100644 --- a/docs/examples/telemetry/metrics_example.py +++ b/docs/examples/telemetry/metrics_example.py @@ -101,6 +101,12 @@ def main(): ) print(f"Email: {str(email)[:100]}...") + # Token usage is available on the result from instruct() + if email.usage: + print(f" → Prompt tokens: {email.usage['prompt_tokens']}") + print(f" → Completion tokens: {email.usage['completion_tokens']}") + print(f" → Total tokens: {email.usage['total_tokens']}") + # Example 3: Multiple operations print("\n3. Multiple operations...") text = "Hello, how are you today?" diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index c7886e7e..4efa5cbb 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -1075,21 +1075,20 @@ async def post_processing( except Exception: pass - # Record metrics if enabled - if metrics_enabled and n_prompt is not None: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - - record_token_usage_metrics( - input_tokens=n_prompt, - output_tokens=n_completion, - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate standardized usage field (convert to OpenAI format) + if n_prompt is not None and n_completion is not None: + mot.usage = { + "prompt_tokens": n_prompt, + "completion_tokens": n_completion, + "total_tokens": n_prompt + n_completion, + } + + # Populate model and provider metadata + if hasattr(self.model_id, "hf_model_name"): + mot.model = str(self.model_id.hf_model_name) # type: ignore + else: + mot.model = str(self.model_id) + mot.provider = "huggingface" # Record tracing if span exists if span is not None: diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index 4f5bac38..43e8c262 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -488,24 +488,13 @@ async def post_processing( if usage is None: usage = mot._meta.get("litellm_streaming_usage") - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled + # Populate standardized usage field (LiteLLM uses OpenAI format) + if usage: + mot.usage = usage - if is_metrics_enabled() and usage: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - from .utils import get_value - - record_token_usage_metrics( - input_tokens=get_value(usage, "prompt_tokens"), - output_tokens=get_value(usage, "completion_tokens"), - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and 
provider metadata + mot.model = str(self.model_id) + mot.provider = "litellm" # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index afe80ec8..8cae59fa 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -622,23 +622,17 @@ async def post_processing( ) completion_tokens = getattr(response, "eval_count", None) if response else None - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled + # Populate standardized usage field (convert to OpenAI format) + if prompt_tokens is not None or completion_tokens is not None: + mot.usage = { + "prompt_tokens": prompt_tokens or 0, + "completion_tokens": completion_tokens or 0, + "total_tokens": (prompt_tokens or 0) + (completion_tokens or 0), + } - if is_metrics_enabled(): - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - - record_token_usage_metrics( - input_tokens=prompt_tokens, - output_tokens=completion_tokens, - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "ollama" # Record telemetry and close span now that response is available span = mot._meta.get("_telemetry_span") diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index faadfc45..561fd58b 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -625,24 +625,13 @@ async def post_processing( if usage is None: usage = mot._meta.get("oai_streaming_usage") - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled + # Populate standardized usage field (OpenAI format already matches) + if usage: + mot.usage = usage - if is_metrics_enabled() and usage: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - from .utils import get_value - - record_token_usage_metrics( - input_tokens=get_value(usage, "prompt_tokens"), - output_tokens=get_value(usage, "completion_tokens"), - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "openai" # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index d6ca943e..f6f65283 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -496,24 +496,13 @@ async def post_processing( else getattr(response, "usage", None) ) - # Record metrics if enabled - from ..telemetry.metrics import is_metrics_enabled + # Populate standardized usage field (WatsonX uses OpenAI format) + if usage: + mot.usage = usage - if is_metrics_enabled() and usage: - from ..telemetry.backend_instrumentation import ( - get_model_id_str, - get_system_name, - ) - from ..telemetry.metrics import record_token_usage_metrics - from .utils import get_value - - record_token_usage_metrics( - input_tokens=get_value(usage, "prompt_tokens"), - output_tokens=get_value(usage, "completion_tokens"), - model=get_model_id_str(self), - backend=self.__class__.__name__, - system=get_system_name(self), - ) + # Populate model and provider metadata + mot.model = str(self.model_id) + mot.provider = "watsonx" # 
Record tracing if span exists span = mot._meta.get("_telemetry_span") diff --git a/mellea/core/base.py b/mellea/core/base.py index 60d65029..4850fc9d 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -196,6 +196,29 @@ def __init__( # Additional fields that should be standardized across apis. self.tool_calls = tool_calls self._thinking: str | None = None + self.usage: dict[str, int] | None = None + """Usage information following OpenAI API standard. + + Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'. + Populated by backends during post_processing. None if unavailable. + + Future: May include optional breakdown fields like 'completion_tokens_details' + and 'prompt_tokens_details' for advanced features (reasoning, audio, caching). + """ + + self.model: str | None = None + """Model identifier that generated this output. + + Examples: 'gpt-4', 'llama2:7b', 'meta-llama/Llama-2-7b-hf'. + Populated by backends. None if unavailable. + """ + + self.provider: str | None = None + """Provider that generated this output. + + Examples: 'openai', 'ollama', 'huggingface', 'watsonx'. + Populated by backends. None if unavailable. + """ # Used for tracking generation. self._context: list[Component | CBlock] | None = None @@ -233,6 +256,9 @@ def _copy_from(self, other: ModelOutputThunk) -> None: self.parsed_repr = other.parsed_repr self.tool_calls = other.tool_calls self._thinking = other._thinking + self.usage = other.usage + self.model = other.model + self.provider = other.provider self._generate_log = other._generate_log def is_computed(self) -> bool: @@ -433,6 +459,9 @@ def __copy__(self) -> ModelOutputThunk: copied._context = self._context copied._generate_log = self._generate_log copied._model_options = self._model_options + copied.usage = self.usage + copied.model = self.model + copied.provider = self.provider return copied def __deepcopy__(self, memo: dict) -> ModelOutputThunk: @@ -462,6 +491,9 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk: ) # The items in a context should be immutable. deepcopied._generate_log = copy(self._generate_log) deepcopied._model_options = copy(self._model_options) + deepcopied.usage = deepcopy(self.usage) if self.usage else None + deepcopied.model = self.model + deepcopied.provider = self.provider return deepcopied diff --git a/mellea/telemetry/metrics.py b/mellea/telemetry/metrics.py index e4b99294..9382c10b 100644 --- a/mellea/telemetry/metrics.py +++ b/mellea/telemetry/metrics.py @@ -407,13 +407,9 @@ def _get_token_counters() -> tuple[Any, Any]: def record_token_usage_metrics( - input_tokens: int | None, - output_tokens: int | None, - model: str, - backend: str, - system: str, + input_tokens: int | None, output_tokens: int | None, model: str, provider: str ) -> None: - """Record token usage metrics following Gen-AI semantic conventions. + """Record token usage metrics following OpenTelemetry Gen-AI semantic conventions. This is a no-op when metrics are disabled, ensuring zero overhead. 
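A hedged sketch of calling the narrowed signature directly (after this diff the call is normally made by `TokenMetricsPlugin`, so a manual call is purely illustrative; the counter names and attribute keys in the comment come from the tests and attribute table elsewhere in this diff):

```python
from mellea.telemetry.metrics import record_token_usage_metrics

# No-op unless metrics are enabled. When enabled, increments the
# mellea.llm.tokens.input and mellea.llm.tokens.output counters, both tagged
# {"gen_ai.provider.name": "ollama", "gen_ai.request.model": "llama2:7b"}.
record_token_usage_metrics(
    input_tokens=150,
    output_tokens=50,
    model="llama2:7b",
    provider="ollama",
)
```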
@@ -421,16 +417,14 @@ def record_token_usage_metrics( input_tokens: Number of input tokens (prompt tokens), or None if unavailable output_tokens: Number of output tokens (completion tokens), or None if unavailable model: Model identifier (e.g., "gpt-4", "llama2:7b") - backend: Backend class name (e.g., "OpenAIBackend", "OllamaBackend") - system: Gen-AI system name (e.g., "openai", "ollama", "watsonx") + provider: Provider name (e.g., "openai", "ollama", "watsonx") Example: record_token_usage_metrics( input_tokens=150, output_tokens=50, model="llama2:7b", - backend="OllamaBackend", - system="ollama" + provider="ollama" ) """ # Early return if metrics are disabled (zero overhead) @@ -440,12 +434,8 @@ def record_token_usage_metrics( # Get the token counters (lazily initialized) input_counter, output_counter = _get_token_counters() - # Prepare attributes following Gen-AI semantic conventions - attributes = { - "gen_ai.system": system, - "gen_ai.request.model": model, - "mellea.backend": backend, - } + # Prepare attributes following OTel Gen-AI semantic conventions + attributes = {"gen_ai.provider.name": provider, "gen_ai.request.model": model} # Record input tokens if available if input_tokens is not None and input_tokens > 0: @@ -456,6 +446,30 @@ def record_token_usage_metrics( output_counter.add(output_tokens, attributes) +# Auto-register TokenMetricsPlugin when metrics are enabled +if _OTEL_AVAILABLE and _METRICS_ENABLED: + try: + from mellea.plugins.registry import register + from mellea.telemetry.metrics_plugins import TokenMetricsPlugin + + # Idempotent registration (supports module reloads in tests) + try: + register(TokenMetricsPlugin()) + except ValueError as e: + # Already registered (expected during module reloads in tests) + warnings.warn( + f"TokenMetricsPlugin already registered: {e}", UserWarning, stacklevel=2 + ) + except ImportError: + warnings.warn( + "Metrics are enabled but the plugin framework is not installed. " + "Token usage metrics will not be recorded automatically. " + "Install with: pip install mellea[telemetry]", + UserWarning, + stacklevel=2, + ) + + __all__ = [ "create_counter", "create_histogram", diff --git a/mellea/telemetry/metrics_plugins.py b/mellea/telemetry/metrics_plugins.py new file mode 100644 index 00000000..cbf45ac8 --- /dev/null +++ b/mellea/telemetry/metrics_plugins.py @@ -0,0 +1,62 @@ +"""Metrics plugins for recording telemetry data via hooks. + +This module contains plugins that hook into the generation pipeline to +automatically record metrics when enabled. Currently includes: + +- TokenMetricsPlugin: Records token usage statistics from ModelOutputThunk.usage +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from mellea.plugins.base import Plugin +from mellea.plugins.decorators import hook +from mellea.plugins.types import PluginMode + +if TYPE_CHECKING: + from mellea.plugins.hooks.generation import GenerationPostCallPayload + + +class TokenMetricsPlugin(Plugin, name="token_metrics", priority=50): + """Records token usage metrics from generation outputs. + + This plugin hooks into the generation_post_call event to automatically + record token usage metrics when the usage field is populated on + ModelOutputThunk instances. + + The plugin reads the standardized usage field (OpenAI-compatible format) + and records metrics following OpenTelemetry Gen-AI semantic conventions. 
+ + Example: + >>> from mellea.telemetry.metrics_plugins import TokenMetricsPlugin + >>> from mellea.telemetry.metrics import enable_metrics + >>> + >>> enable_metrics() + >>> with TokenMetricsPlugin(): + ... result = session.instruct("Hello, world!") + """ + + @hook("generation_post_call", mode=PluginMode.SEQUENTIAL) + async def record_token_metrics( + self, payload: GenerationPostCallPayload, context: dict[str, Any] + ) -> None: + """Record token metrics after generation completes. + + Args: + payload: Contains the model_output (ModelOutputThunk) with usage data + context: Plugin context (unused) + """ + from mellea.telemetry.metrics import record_token_usage_metrics + + mot = payload.model_output + if mot.usage is None: + return + + # Record metrics (no-op if metrics disabled) + record_token_usage_metrics( + input_tokens=mot.usage.get("prompt_tokens"), + output_tokens=mot.usage.get("completion_tokens"), + model=mot.model or "unknown", + provider=mot.provider or "unknown", + ) diff --git a/pyproject.toml b/pyproject.toml index d5d7bb24..fbde0025 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,7 @@ telemetry = [ "opentelemetry-exporter-otlp>=1.20.0", "opentelemetry-exporter-prometheus>=0.40b0", "opentelemetry-distro>=0.59b0", + "mellea[hooks]", ] docling = [ diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 805f8c4d..e2881f3b 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -328,6 +328,14 @@ async def test_async_avalue(session) -> None: assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "huggingface" + @pytest.mark.qualitative async def test_generate_with_lock(backend) -> None: diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py index ece9c890..d41367bb 100644 --- a/test/backends/test_litellm_ollama.py +++ b/test/backends/test_litellm_ollama.py @@ -202,6 +202,14 @@ async def test_async_avalue(session): assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "litellm" + if __name__ == "__main__": import pytest diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index 706766ea..a4b75f59 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -196,6 +196,14 @@ async def test_async_avalue(session) -> None: assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "ollama" + def test_multiple_asyncio_runs(session) -> None: async def test(): diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 142d0781..10e4ac0b 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -194,6 +194,14 @@ async def test_async_avalue(m_session) -> None: assert m1_final_val 
is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "openai" + def test_client_cache(backend) -> None: first_client = backend._async_client diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index c9f27be2..87784e0b 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -207,6 +207,14 @@ async def test_async_avalue(session): assert m1_final_val is not None assert m1_final_val == mot1.value + # Verify telemetry fields are populated + assert mot1.usage is not None + assert mot1.usage["prompt_tokens"] >= 0 + assert mot1.usage["completion_tokens"] > 0 + assert mot1.usage["total_tokens"] > 0 + assert mot1.model is not None + assert mot1.provider == "watsonx" + def test_client_cache(backend): first_client = backend._model diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py index 81143800..f07ea61e 100644 --- a/test/telemetry/test_metrics.py +++ b/test/telemetry/test_metrics.py @@ -569,11 +569,7 @@ def test_token_counters_lazy_initialization(enable_metrics): from mellea.telemetry.metrics import record_token_usage_metrics record_token_usage_metrics( - input_tokens=100, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) # Now should be initialized @@ -589,11 +585,7 @@ def test_record_token_usage_metrics_with_valid_tokens(enable_metrics): # Should not raise record_token_usage_metrics( - input_tokens=150, - output_tokens=50, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=150, output_tokens=50, model="gpt-4", provider="openai" ) @@ -603,11 +595,7 @@ def test_record_token_usage_metrics_with_none_tokens(enable_metrics): # Should not raise record_token_usage_metrics( - input_tokens=None, - output_tokens=None, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=None, output_tokens=None, model="llama2:7b", provider="ollama" ) @@ -617,11 +605,7 @@ def test_record_token_usage_metrics_with_zero_tokens(enable_metrics): # Should not raise, but won't record zeros record_token_usage_metrics( - input_tokens=0, - output_tokens=0, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=0, output_tokens=0, model="llama2:7b", provider="ollama" ) @@ -631,11 +615,7 @@ def test_record_token_usage_metrics_noop_when_disabled(clean_metrics_env): # Should not raise and should be no-op record_token_usage_metrics( - input_tokens=100, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) # Counters should still be None (not initialized) diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py index 5b2702bd..8918564e 100644 --- a/test/telemetry/test_metrics_backend.py +++ b/test/telemetry/test_metrics_backend.py @@ -142,12 +142,12 @@ async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, s # Verify input token counter input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "ollama"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "ollama"} ) # Verify output token counter output_tokens 
= get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "ollama"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "ollama"} ) # Ollama should always return token counts @@ -198,11 +198,11 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s # OpenAI always provides token counts input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "openai"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "openai"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "openai"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "openai"} ) assert input_tokens is not None, "Input tokens should be recorded" @@ -246,11 +246,11 @@ async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader): metrics_data = metric_reader.get_metrics_data() input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "watsonx"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "watsonx"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "watsonx"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "watsonx"} ) assert input_tokens is not None, "Input tokens should be recorded" @@ -306,11 +306,11 @@ async def test_litellm_token_metrics_integration( metrics_data = metric_reader.get_metrics_data() input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "litellm"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "litellm"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "litellm"} + metrics_data, "mellea.llm.tokens.output", {"gen_ai.provider.name": "litellm"} ) # LiteLLM with Ollama backend should always provide token counts @@ -357,11 +357,13 @@ async def test_huggingface_token_metrics_integration( # HuggingFace computes token counts locally input_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.input", {"gen_ai.system": "huggingface"} + metrics_data, "mellea.llm.tokens.input", {"gen_ai.provider.name": "huggingface"} ) output_tokens = get_metric_value( - metrics_data, "mellea.llm.tokens.output", {"gen_ai.system": "huggingface"} + metrics_data, + "mellea.llm.tokens.output", + {"gen_ai.provider.name": "huggingface"}, ) assert input_tokens is not None, "Input tokens should be recorded" diff --git a/test/telemetry/test_metrics_token.py b/test/telemetry/test_metrics_token.py index a28a9460..2a61acce 100644 --- a/test/telemetry/test_metrics_token.py +++ b/test/telemetry/test_metrics_token.py @@ -58,11 +58,7 @@ def test_record_token_metrics_basic(clean_metrics_env): # Record some token usage record_token_usage_metrics( - input_tokens=150, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=150, output_tokens=50, model="llama2:7b", provider="ollama" ) # Force metrics collection @@ -86,9 +82,8 @@ def test_record_token_metrics_basic(clean_metrics_env): # Verify attributes for data_point in metric.data.data_points: attrs = dict(data_point.attributes) - assert attrs["gen_ai.system"] == "ollama" + assert attrs["gen_ai.provider.name"] == "ollama" assert attrs["gen_ai.request.model"] == "llama2:7b" - assert attrs["mellea.backend"] == "OllamaBackend" assert data_point.value == 150 if metric.name == "mellea.llm.tokens.output": @@ -96,9 +91,8 @@ def 
test_record_token_metrics_basic(clean_metrics_env): # Verify attributes for data_point in metric.data.data_points: attrs = dict(data_point.attributes) - assert attrs["gen_ai.system"] == "ollama" + assert attrs["gen_ai.provider.name"] == "ollama" assert attrs["gen_ai.request.model"] == "llama2:7b" - assert attrs["mellea.backend"] == "OllamaBackend" assert data_point.value == 50 assert found_input, "Input token metric not found" @@ -120,18 +114,10 @@ def test_record_token_metrics_accumulation(clean_metrics_env): # Record multiple token usages with same attributes record_token_usage_metrics( - input_tokens=100, - output_tokens=30, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=100, output_tokens=30, model="gpt-4", provider="openai" ) record_token_usage_metrics( - input_tokens=200, - output_tokens=70, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=200, output_tokens=70, model="gpt-4", provider="openai" ) # Force metrics collection @@ -166,11 +152,7 @@ def test_record_token_metrics_none_handling(clean_metrics_env): # Record with None values (should not crash) record_token_usage_metrics( - input_tokens=None, - output_tokens=None, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=None, output_tokens=None, model="llama2:7b", provider="ollama" ) # Should not raise, and no metrics should be recorded for None values @@ -203,25 +185,13 @@ def test_record_token_metrics_multiple_backends(clean_metrics_env): # Record from different backends record_token_usage_metrics( - input_tokens=100, - output_tokens=50, - model="llama2:7b", - backend="OllamaBackend", - system="ollama", + input_tokens=100, output_tokens=50, model="llama2:7b", provider="ollama" ) record_token_usage_metrics( - input_tokens=200, - output_tokens=80, - model="gpt-4", - backend="OpenAIBackend", - system="openai", + input_tokens=200, output_tokens=80, model="gpt-4", provider="openai" ) record_token_usage_metrics( - input_tokens=150, - output_tokens=60, - model="granite-3-8b", - backend="WatsonxBackend", - system="watsonx", + input_tokens=150, output_tokens=60, model="granite-3-8b", provider="watsonx" ) # Force metrics collection @@ -240,18 +210,16 @@ def test_record_token_metrics_multiple_backends(clean_metrics_env): for dp in metric.data.data_points: attrs = dict(dp.attributes) key = ( - attrs["gen_ai.system"], + attrs["gen_ai.provider.name"], attrs["gen_ai.request.model"], - attrs["mellea.backend"], ) input_attrs.add(key) if metric.name == "mellea.llm.tokens.output": for dp in metric.data.data_points: attrs = dict(dp.attributes) key = ( - attrs["gen_ai.system"], + attrs["gen_ai.provider.name"], attrs["gen_ai.request.model"], - attrs["mellea.backend"], ) output_attrs.add(key) diff --git a/uv.lock b/uv.lock index e64504fb..f64846ff 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.14' and python_full_version < '4'", @@ -3747,6 +3747,8 @@ server = [ { name = "uvicorn" }, ] telemetry = [ + { name = "cpex" }, + { name = "grpcio" }, { name = "opentelemetry-api" }, { name = "opentelemetry-distro" }, { name = "opentelemetry-exporter-otlp" }, @@ -3854,6 +3856,7 @@ requires-dist = [ { name = "llm-sandbox", extras = ["docker"], marker = "extra == 'sandbox'", specifier = ">=0.3.23" }, { name = "math-verify" }, { name = "mellea", extras = ["backends", "docling", "tools", "telemetry", "server", "sandbox", "granite-retriever", "hooks"], marker = 
"extra == 'all'" }, + { name = "mellea", extras = ["hooks"], marker = "extra == 'telemetry'" }, { name = "mellea", extras = ["watsonx", "hf", "vllm", "litellm"], marker = "extra == 'backends'" }, { name = "mistletoe", specifier = ">=1.4.0" }, { name = "numpy", marker = "extra == 'vllm'", specifier = "<=2.2" }, @@ -5813,11 +5816,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/75/2e/a9e28941c6dab6f06e6d3f6783d3373044be9b0f9a9d3492c3d8d2260ac0/pybase64-1.4.3-cp312-cp312-win32.whl", hash = "sha256:7bca1ed3a5df53305c629ca94276966272eda33c0d71f862d2d3d043f1e1b91a", size = 33686, upload-time = "2025-12-06T13:23:37.848Z" }, { url = "https://files.pythonhosted.org/packages/83/e3/507ab649d8c3512c258819c51d25c45d6e29d9ca33992593059e7b646a33/pybase64-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:9f2da8f56d9b891b18b4daf463a0640eae45a80af548ce435be86aa6eff3603b", size = 35833, upload-time = "2025-12-06T13:23:38.877Z" }, { url = "https://files.pythonhosted.org/packages/bc/8a/6eba66cd549a2fc74bb4425fd61b839ba0ab3022d3c401b8a8dc2cc00c7a/pybase64-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:0631d8a2d035de03aa9bded029b9513e1fee8ed80b7ddef6b8e9389ffc445da0", size = 31185, upload-time = "2025-12-06T13:23:39.908Z" }, - { url = "https://files.pythonhosted.org/packages/3a/50/b7170cb2c631944388fe2519507fe3835a4054a6a12a43f43781dae82be1/pybase64-1.4.3-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:ea4b785b0607d11950b66ce7c328f452614aefc9c6d3c9c28bae795dc7f072e1", size = 33901, upload-time = "2025-12-06T13:23:40.951Z" }, { url = "https://files.pythonhosted.org/packages/48/8b/69f50578e49c25e0a26e3ee72c39884ff56363344b79fc3967f5af420ed6/pybase64-1.4.3-cp313-cp313-android_21_x86_64.whl", hash = "sha256:6a10b6330188c3026a8b9c10e6b9b3f2e445779cf16a4c453d51a072241c65a2", size = 40807, upload-time = "2025-12-06T13:23:42.006Z" }, - { url = "https://files.pythonhosted.org/packages/5c/8d/20b68f11adfc4c22230e034b65c71392e3e338b413bf713c8945bd2ccfb3/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:27fdff227a0c0e182e0ba37a99109645188978b920dfb20d8b9c17eeee370d0d", size = 30932, upload-time = "2025-12-06T13:23:43.348Z" }, - { url = "https://files.pythonhosted.org/packages/f7/79/b1b550ac6bff51a4880bf6e089008b2e1ca16f2c98db5e039a08ac3ad157/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2a8204f1fdfec5aa4184249b51296c0de95445869920c88123978304aad42df1", size = 31394, upload-time = "2025-12-06T13:23:44.317Z" }, - { url = "https://files.pythonhosted.org/packages/82/70/b5d7c5932bf64ee1ec5da859fbac981930b6a55d432a603986c7f509c838/pybase64-1.4.3-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:874fc2a3777de6baf6aa921a7aa73b3be98295794bea31bd80568a963be30767", size = 38078, upload-time = "2025-12-06T13:23:45.348Z" }, { url = "https://files.pythonhosted.org/packages/1c/c9/24b3b905cf75e23a9a4deaf203b35ffcb9f473ac0e6d8257f91a05dfce62/pybase64-1.4.3-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:1d45c8fe8fe82b65c36b227bb4a2cf623d9ada16bed602ce2d3e18c35285b72a", size = 68244, upload-time = "2025-12-06T13:23:49.026Z" }, { url = "https://files.pythonhosted.org/packages/f8/cd/d15b0c3e25e5859fab0416dc5b96d34d6bd2603c1c96a07bb2202b68ab92/pybase64-1.4.3-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ad70c26ba091d8f5167e9d4e1e86a0483a5414805cdb598a813db635bd3be8b8", size = 71620, upload-time = "2025-12-06T13:23:50.081Z" }, { 
url = "https://files.pythonhosted.org/packages/0d/31/4ca953cc3dcde2b3711d6bfd70a6f4ad2ca95a483c9698076ba605f1520f/pybase64-1.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e98310b7c43145221e7194ac9fa7fffc84763c87bfc5e2f59f9f92363475bdc1", size = 59930, upload-time = "2025-12-06T13:23:51.68Z" }, @@ -5852,11 +5851,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/10/abb7757c330bb869ebb95dab0c57edf5961ffbd6c095c8209cbbf75d117d/pybase64-1.4.3-cp313-cp313t-win32.whl", hash = "sha256:46d75c9387f354c5172582a9eaae153b53a53afeb9c19fcf764ea7038be3bd8b", size = 33965, upload-time = "2025-12-06T13:24:28.548Z" }, { url = "https://files.pythonhosted.org/packages/63/a0/2d4e5a59188e9e6aed0903d580541aaea72dcbbab7bf50fb8b83b490b6c3/pybase64-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:d7344625591d281bec54e85cbfdab9e970f6219cac1570f2aa140b8c942ccb81", size = 36207, upload-time = "2025-12-06T13:24:29.646Z" }, { url = "https://files.pythonhosted.org/packages/1f/05/95b902e8f567b4d4b41df768ccc438af618f8d111e54deaf57d2df46bd76/pybase64-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:28a3c60c55138e0028313f2eccd321fec3c4a0be75e57a8d3eb883730b1b0880", size = 31505, upload-time = "2025-12-06T13:24:30.687Z" }, - { url = "https://files.pythonhosted.org/packages/e4/80/4bd3dff423e5a91f667ca41982dc0b79495b90ec0c0f5d59aca513e50f8c/pybase64-1.4.3-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:015bb586a1ea1467f69d57427abe587469392215f59db14f1f5c39b52fdafaf5", size = 33835, upload-time = "2025-12-06T13:24:31.767Z" }, { url = "https://files.pythonhosted.org/packages/45/60/a94d94cc1e3057f602e0b483c9ebdaef40911d84a232647a2fe593ab77bb/pybase64-1.4.3-cp314-cp314-android_24_x86_64.whl", hash = "sha256:d101e3a516f837c3dcc0e5a0b7db09582ebf99ed670865223123fb2e5839c6c0", size = 40673, upload-time = "2025-12-06T13:24:32.82Z" }, - { url = "https://files.pythonhosted.org/packages/e3/71/cf62b261d431857e8e054537a5c3c24caafa331de30daede7b2c6c558501/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8f183ac925a48046abe047360fe3a1b28327afb35309892132fe1915d62fb282", size = 30939, upload-time = "2025-12-06T13:24:34.001Z" }, - { url = "https://files.pythonhosted.org/packages/24/3e/d12f92a3c1f7c6ab5d53c155bff9f1084ba997a37a39a4f781ccba9455f3/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30bf3558e24dcce4da5248dcf6d73792adfcf4f504246967e9db155be4c439ad", size = 31401, upload-time = "2025-12-06T13:24:35.11Z" }, - { url = "https://files.pythonhosted.org/packages/9b/3d/9c27440031fea0d05146f8b70a460feb95d8b4e3d9ca8f45c972efb4c3d3/pybase64-1.4.3-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a674b419de318d2ce54387dd62646731efa32b4b590907800f0bd40675c1771d", size = 38075, upload-time = "2025-12-06T13:24:36.53Z" }, { url = "https://files.pythonhosted.org/packages/db/26/b136a4b65e5c94ff06217f7726478df3f31ab1c777c2c02cf698e748183f/pybase64-1.4.3-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:b51204d349a4b208287a8aa5b5422be3baa88abf6cc8ff97ccbda34919bbc857", size = 68460, upload-time = "2025-12-06T13:24:41.735Z" }, { url = "https://files.pythonhosted.org/packages/68/6d/84ce50e7ee1ae79984d689e05a9937b2460d4efa1e5b202b46762fb9036c/pybase64-1.4.3-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:30f2fd53efecbdde4bdca73a872a68dcb0d1bf8a4560c70a3e7746df973e1ef3", size = 71688, upload-time = 
"2025-12-06T13:24:42.908Z" }, { url = "https://files.pythonhosted.org/packages/e3/57/6743e420416c3ff1b004041c85eb0ebd9c50e9cf05624664bfa1dc8b5625/pybase64-1.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0932b0c5cfa617091fd74f17d24549ce5de3628791998c94ba57be808078eeaf", size = 60040, upload-time = "2025-12-06T13:24:44.37Z" },