Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ Tests/examples automatically skip if system lacks required resources. Heavy exam
- **Google-style docstrings**
- **Ruff** for linting/formatting
- Use `...` in `@generative` function bodies
- Use `...` in `@generative` function bodies
- Prefer primitives over classes
- **Friendly Dependency Errors**: Wrap optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples.
- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. Metrics are automatically recorded by `TokenMetricsPlugin` — don't add manual `record_token_usage_metrics()` calls.

## 5. Commits & Hooks
[Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:`
Expand Down
23 changes: 20 additions & 3 deletions docs/dev/telemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,8 @@ All token metrics include these attributes following Gen-AI semantic conventions

| Attribute | Description | Example Values |
|-----------|-------------|----------------|
| `gen_ai.system` | Backend system name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
| `gen_ai.provider.name` | Backend provider name | `openai`, `ollama`, `watsonx`, `litellm`, `huggingface` |
| `gen_ai.request.model` | Model identifier | `gpt-4`, `llama3.2:7b`, `granite-3.1-8b-instruct` |
| `mellea.backend` | Backend class name | `OpenAIBackend`, `OllamaBackend`, `WatsonxBackend` |

#### Backend Support

Expand Down Expand Up @@ -365,12 +364,30 @@ if is_metrics_enabled():
print("Token metrics are being collected")
```

Access token usage data from `ModelOutputThunk`:

```python
from mellea import start_session

with start_session() as m:
result = m.instruct("Write a haiku about programming")

# Access token usage (follows OpenAI API format)
if result.usage:
print(f"Prompt tokens: {result.usage['prompt_tokens']}")
print(f"Completion tokens: {result.usage['completion_tokens']}")
print(f"Total tokens: {result.usage['total_tokens']}")
```

The `usage` field is a dictionary with three keys: `prompt_tokens`, `completion_tokens`, and `total_tokens`. All backends populate this field consistently.

#### Performance

- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), `record_token_usage_metrics()` returns immediately with no processing
- **Zero overhead when disabled**: When `MELLEA_METRICS_ENABLED=false` (default), the TokenMetricsPlugin is not registered and has no overhead
- **Minimal overhead when enabled**: Counter increments are extremely fast (~nanoseconds per operation)
- **Async export**: Metrics are batched and exported asynchronously (default: every 60 seconds)
- **Non-blocking**: Metric recording never blocks LLM calls
- **Automatic collection**: Metrics are recorded via hooks after generation completes—no manual instrumentation needed

#### Use Cases

Expand Down
6 changes: 6 additions & 0 deletions docs/examples/telemetry/metrics_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ def main():
)
print(f"Email: {str(email)[:100]}...")

# Token usage is available on the result from instruct()
if email.usage:
print(f" → Prompt tokens: {email.usage['prompt_tokens']}")
print(f" → Completion tokens: {email.usage['completion_tokens']}")
print(f" → Total tokens: {email.usage['total_tokens']}")

# Example 3: Multiple operations
print("\n3. Multiple operations...")
text = "Hello, how are you today?"
Expand Down
29 changes: 14 additions & 15 deletions mellea/backends/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,21 +1075,20 @@ async def post_processing(
except Exception:
pass

# Record metrics if enabled
if metrics_enabled and n_prompt is not None:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics

record_token_usage_metrics(
input_tokens=n_prompt,
output_tokens=n_completion,
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate standardized usage field (convert to OpenAI format)
if n_prompt is not None and n_completion is not None:
mot.usage = {
"prompt_tokens": n_prompt,
"completion_tokens": n_completion,
"total_tokens": n_prompt + n_completion,
}

# Populate model and provider metadata
if hasattr(self.model_id, "hf_model_name"):
mot.model = str(self.model_id.hf_model_name) # type: ignore
else:
mot.model = str(self.model_id)
mot.provider = "huggingface"

# Record tracing if span exists
if span is not None:
Expand Down
23 changes: 6 additions & 17 deletions mellea/backends/litellm.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,24 +488,13 @@ async def post_processing(
if usage is None:
usage = mot._meta.get("litellm_streaming_usage")

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (LiteLLM uses OpenAI format)
if usage:
mot.usage = usage

if is_metrics_enabled() and usage:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics
from .utils import get_value

record_token_usage_metrics(
input_tokens=get_value(usage, "prompt_tokens"),
output_tokens=get_value(usage, "completion_tokens"),
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "litellm"

# Record telemetry now that response is available
span = mot._meta.get("_telemetry_span")
Expand Down
26 changes: 10 additions & 16 deletions mellea/backends/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,23 +622,17 @@ async def post_processing(
)
completion_tokens = getattr(response, "eval_count", None) if response else None

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (convert to OpenAI format)
if prompt_tokens is not None or completion_tokens is not None:
mot.usage = {
"prompt_tokens": prompt_tokens or 0,
"completion_tokens": completion_tokens or 0,
"total_tokens": (prompt_tokens or 0) + (completion_tokens or 0),
}

if is_metrics_enabled():
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics

record_token_usage_metrics(
input_tokens=prompt_tokens,
output_tokens=completion_tokens,
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "ollama"

# Record telemetry and close span now that response is available
span = mot._meta.get("_telemetry_span")
Expand Down
23 changes: 6 additions & 17 deletions mellea/backends/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,24 +625,13 @@ async def post_processing(
if usage is None:
usage = mot._meta.get("oai_streaming_usage")

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (OpenAI format already matches)
if usage:
mot.usage = usage

if is_metrics_enabled() and usage:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics
from .utils import get_value

record_token_usage_metrics(
input_tokens=get_value(usage, "prompt_tokens"),
output_tokens=get_value(usage, "completion_tokens"),
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "openai"

# Record telemetry now that response is available
span = mot._meta.get("_telemetry_span")
Expand Down
23 changes: 6 additions & 17 deletions mellea/backends/watsonx.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,24 +496,13 @@ async def post_processing(
else getattr(response, "usage", None)
)

# Record metrics if enabled
from ..telemetry.metrics import is_metrics_enabled
# Populate standardized usage field (WatsonX uses OpenAI format)
if usage:
mot.usage = usage

if is_metrics_enabled() and usage:
from ..telemetry.backend_instrumentation import (
get_model_id_str,
get_system_name,
)
from ..telemetry.metrics import record_token_usage_metrics
from .utils import get_value

record_token_usage_metrics(
input_tokens=get_value(usage, "prompt_tokens"),
output_tokens=get_value(usage, "completion_tokens"),
model=get_model_id_str(self),
backend=self.__class__.__name__,
system=get_system_name(self),
)
# Populate model and provider metadata
mot.model = str(self.model_id)
mot.provider = "watsonx"

# Record tracing if span exists
span = mot._meta.get("_telemetry_span")
Expand Down
32 changes: 32 additions & 0 deletions mellea/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,29 @@ def __init__(
# Additional fields that should be standardized across apis.
self.tool_calls = tool_calls
self._thinking: str | None = None
self.usage: dict[str, int] | None = None
"""Usage information following OpenAI API standard.

Core fields: 'prompt_tokens', 'completion_tokens', 'total_tokens'.
Populated by backends during post_processing. None if unavailable.

Future: May include optional breakdown fields like 'completion_tokens_details'
and 'prompt_tokens_details' for advanced features (reasoning, audio, caching).
"""

self.model: str | None = None
"""Model identifier that generated this output.

Examples: 'gpt-4', 'llama2:7b', 'meta-llama/Llama-2-7b-hf'.
Populated by backends. None if unavailable.
"""

self.provider: str | None = None
"""Provider that generated this output.

Examples: 'openai', 'ollama', 'huggingface', 'watsonx'.
Populated by backends. None if unavailable.
"""

# Used for tracking generation.
self._context: list[Component | CBlock] | None = None
Expand Down Expand Up @@ -233,6 +256,9 @@ def _copy_from(self, other: ModelOutputThunk) -> None:
self.parsed_repr = other.parsed_repr
self.tool_calls = other.tool_calls
self._thinking = other._thinking
self.usage = other.usage
self.model = other.model
self.provider = other.provider
self._generate_log = other._generate_log

def is_computed(self) -> bool:
Expand Down Expand Up @@ -433,6 +459,9 @@ def __copy__(self) -> ModelOutputThunk:
copied._context = self._context
copied._generate_log = self._generate_log
copied._model_options = self._model_options
copied.usage = self.usage
copied.model = self.model
copied.provider = self.provider
return copied

def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
Expand Down Expand Up @@ -462,6 +491,9 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
) # The items in a context should be immutable.
deepcopied._generate_log = copy(self._generate_log)
deepcopied._model_options = copy(self._model_options)
deepcopied.usage = deepcopy(self.usage) if self.usage else None
deepcopied.model = self.model
deepcopied.provider = self.provider
return deepcopied


Expand Down
46 changes: 30 additions & 16 deletions mellea/telemetry/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,30 +407,24 @@ def _get_token_counters() -> tuple[Any, Any]:


def record_token_usage_metrics(
input_tokens: int | None,
output_tokens: int | None,
model: str,
backend: str,
system: str,
input_tokens: int | None, output_tokens: int | None, model: str, provider: str
) -> None:
"""Record token usage metrics following Gen-AI semantic conventions.
"""Record token usage metrics following OpenTelemetry Gen-AI semantic conventions.

This is a no-op when metrics are disabled, ensuring zero overhead.

Args:
input_tokens: Number of input tokens (prompt tokens), or None if unavailable
output_tokens: Number of output tokens (completion tokens), or None if unavailable
model: Model identifier (e.g., "gpt-4", "llama2:7b")
backend: Backend class name (e.g., "OpenAIBackend", "OllamaBackend")
system: Gen-AI system name (e.g., "openai", "ollama", "watsonx")
provider: Provider name (e.g., "openai", "ollama", "watsonx")

Example:
record_token_usage_metrics(
input_tokens=150,
output_tokens=50,
model="llama2:7b",
backend="OllamaBackend",
system="ollama"
provider="ollama"
)
"""
# Early return if metrics are disabled (zero overhead)
Expand All @@ -440,12 +434,8 @@ def record_token_usage_metrics(
# Get the token counters (lazily initialized)
input_counter, output_counter = _get_token_counters()

# Prepare attributes following Gen-AI semantic conventions
attributes = {
"gen_ai.system": system,
"gen_ai.request.model": model,
"mellea.backend": backend,
}
# Prepare attributes following OTel Gen-AI semantic conventions
attributes = {"gen_ai.provider.name": provider, "gen_ai.request.model": model}

# Record input tokens if available
if input_tokens is not None and input_tokens > 0:
Expand All @@ -456,6 +446,30 @@ def record_token_usage_metrics(
output_counter.add(output_tokens, attributes)


# Auto-register TokenMetricsPlugin at import time when metrics are enabled,
# so token usage is recorded via hooks without manual instrumentation.
# NOTE(review): assumes _OTEL_AVAILABLE and _METRICS_ENABLED are module-level
# flags defined earlier in this module (not visible in this hunk) — confirm.
if _OTEL_AVAILABLE and _METRICS_ENABLED:
    try:
        from mellea.plugins.registry import register
        from mellea.telemetry.metrics_plugins import TokenMetricsPlugin

        # Idempotent registration (supports module reloads in tests)
        try:
            register(TokenMetricsPlugin())
        except ValueError as e:
            # Already registered (expected during module reloads in tests);
            # warn instead of raising so a reload never breaks the import.
            warnings.warn(
                f"TokenMetricsPlugin already registered: {e}", UserWarning, stacklevel=2
            )
    except ImportError:
        # The plugin framework is an optional dependency: degrade gracefully
        # with a visible warning rather than failing this module's import.
        warnings.warn(
            "Metrics are enabled but the plugin framework is not installed. "
            "Token usage metrics will not be recorded automatically. "
            "Install with: pip install mellea[telemetry]",
            UserWarning,
            stacklevel=2,
        )


__all__ = [
"create_counter",
"create_histogram",
Expand Down
Loading
Loading