diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md new file mode 100644 index 000000000..62fb6539b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +All notable changes to the LoongSuite BFCL v4 instrumentation are documented +in this file. + +## Unreleased + +### Added + +- Initial release of `loongsuite-instrumentation-bfclv4`. +- ENTRY span around `bfcl_eval._llm_response_generation.generate_results`. +- AGENT span around `bfcl_eval.model_handler.base_handler.BaseHandler.inference` + with cross-thread OTel context propagation via a narrow patch of + `bfcl_eval._llm_response_generation.ThreadPoolExecutor`. +- STEP spans created by reflectively wrapping each handler's + `_query_FC` / `_query_prompting` (discovered via + `bfcl_eval.constants.model_config.MODEL_CONFIG_MAPPING`). +- Per-call TOOL spans emitted by wrapping + `bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call`. +- Provider override mapping for OSS handlers (vLLM / SGLang). +- Multi-turn `bfcl.turn_idx` and ReAct `gen_ai.react.round` tracking via + `contextvars`. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md new file mode 100644 index 000000000..7a4e5d69d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md @@ -0,0 +1,79 @@ +# LoongSuite BFCL v4 Instrumentation + +LoongSuite Python instrumentation for the [Berkeley Function Call +Leaderboard v4](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) +(`bfcl-eval`, package `bfcl_eval`). + +## Span Topology + +``` +ENTRY enter_ai_application_system gen_ai.span.kind=ENTRY, op=enter +└─ AGENT invoke_agent {test_entry_id} gen_ai.span.kind=AGENT, op=invoke_agent + ├─ STEP react step gen_ai.span.kind=STEP, op=react + │ ├─ LLM chat {model} (created by downstream vendor SDK probe) + │ └─ TOOL execute_tool {fn} gen_ai.span.kind=TOOL, op=execute_tool + └─ STEP react step + └─ ... +``` + +This instrumentation deliberately does **not** create LLM spans. They are +emitted by the downstream vendor SDK probe (OpenAI / Anthropic / Google / +DashScope / LiteLLM / etc.) so that token usage and request payloads stay in +sync with the SDK that actually performed the request. + +## Installation + +```bash +pip install loongsuite-instrumentation-bfclv4 +``` + +## Usage + +```bash +opentelemetry-instrument bfcl generate \ + --model gpt-4o-2024-11-20-FC \ + --test-category simple_python \ + --num-threads 2 +``` + +Or programmatically: + +```python +from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + +BFCLv4Instrumentor().instrument() +# ... run BFCL ... +BFCLv4Instrumentor().uninstrument() +``` + +## Compatibility With Downstream LLM SDK Probes + +| Scenario | Recommended downstream probe | +| --- | --- | +| OpenAI / OpenAI Responses / OSS via vLLM / SGLang / DeepSeek (OpenAI-compatible) | `opentelemetry-instrumentation-openai` | +| Anthropic / Claude | `loongsuite-instrumentation-claude-agent-sdk` | +| Gemini / Google | `loongsuite-instrumentation-google-adk` | +| Qwen / DashScope | `loongsuite-instrumentation-dashscope` | +| LiteLLM | `loongsuite-instrumentation-litellm` | + +## OSS Provider Notes + +For OSS handlers (vLLM / SGLang served via the OpenAI-compatible API), the +BFCL probe sets `gen_ai.provider.name` to `vllm` / `sglang` / `oss` and adds +`bfcl.oss.backend` for disambiguation. Downstream OpenAI probes will still +report `gen_ai.provider.name=openai` on the LLM span; this is expected. + +## Custom Attributes + +| Attribute | Where | Description | +| --- | --- | --- | +| `gen_ai.framework` = `bfclv4` | ENTRY/AGENT/STEP/TOOL | Framework tag | +| `bfcl.test_category` | ENTRY/AGENT | Test category | +| `bfcl.num_threads` | ENTRY | Configured thread pool size | +| `bfcl.test_case_count` | ENTRY | Number of test cases | +| `bfcl.run_ids` | ENTRY | Whether the run targeted specific IDs | +| `bfcl.test_entry_id` | AGENT | Test entry id | +| `bfcl.turn_idx` | STEP | Multi-turn turn index (0-based) | +| `bfcl.query_mode` | STEP | `FC` or `prompting` | +| `bfcl.oss.backend` | AGENT/STEP | `vllm` / `sglang` / `unknown` (only OSS) | +| `bfcl.tool.duration_is_estimated` | TOOL | True (latency is averaged across batch) | diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml new file mode 100644 index 000000000..3eeb5d026 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-bfclv4" +dynamic = ["version"] +description = "LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "bfcl-eval >= 4.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +bfclv4 = "opentelemetry.instrumentation.bfclv4:BFCLv4Instrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/bfclv4/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py new file mode 100644 index 000000000..6a7729940 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py @@ -0,0 +1,322 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + + BFCLv4Instrumentor().instrument() + # ... run BFCL ... + BFCLv4Instrumentor().uninstrument() + +API +--- +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection, List, Tuple + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + BaseHandlerInferenceWrapper, + ExecuteFuncCallWrapper, + GenerateResultsWrapper, + QueryWrapper, + TurnBumpWrapper, +) +from opentelemetry.instrumentation.bfclv4.package import _instruments +from opentelemetry.instrumentation.bfclv4.utils import GenAIHookHelper +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap + +logger = logging.getLogger(__name__) + +__all__ = ["BFCLv4Instrumentor"] + + +_GENERATE_RESULTS_MODULE = "bfcl_eval._llm_response_generation" +_GENERATE_RESULTS_NAME = "generate_results" + +_BASE_HANDLER_MODULE = "bfcl_eval.model_handler.base_handler" +_BASE_HANDLER_NAME = "BaseHandler.inference" + +_EXECUTE_TOOL_MODULE = ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils" +) +_EXECUTE_TOOL_NAME = "execute_multi_turn_func_call" + + +# ``MODEL_CONFIG_MAPPING`` already imports every concrete handler at module +# load time, so iterating over its values gives us the canonical handler +# class set without risking new vendor SDK imports. +def _iter_handler_classes() -> List[type]: + try: + from bfcl_eval.constants.model_config import ( # noqa: PLC0415 + MODEL_CONFIG_MAPPING, + ) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: cannot import MODEL_CONFIG_MAPPING: %s", exc + ) + return [] + + classes: List[type] = [] + seen_class_ids: set[int] = set() + for cfg in MODEL_CONFIG_MAPPING.values(): + cls = getattr(cfg, "model_handler", None) + if cls is None or not isinstance(cls, type): + continue + if id(cls) in seen_class_ids: + continue + seen_class_ids.add(id(cls)) + classes.append(cls) + return classes + + +class BFCLv4Instrumentor(BaseInstrumentor): + """An instrumentor for the BFCL v4 (``bfcl_eval``) framework.""" + + def __init__(self) -> None: + super().__init__() + if not hasattr(self, "_wrapped_query_methods"): + self._wrapped_query_methods: List[Tuple[type, str]] = [] + if not hasattr(self, "_wrapped_turn_methods"): + self._wrapped_turn_methods: List[Tuple[type, str]] = [] + if not hasattr(self, "_entry_wrapped"): + self._entry_wrapped = False + if not hasattr(self, "_inference_wrapped"): + self._inference_wrapped = False + if not hasattr(self, "_tool_wrapped"): + self._tool_wrapped = False + if not hasattr(self, "_tool_targets"): + self._tool_targets: List[Tuple[str, str]] = [] + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + # ------------------------------------------------------------------ + # _instrument + + def _instrument(self, **kwargs: Any) -> None: # noqa: D401 + helper = GenAIHookHelper() + + # 1) ENTRY ----------------------------------------------------- + try: + wrap_function_wrapper( + _GENERATE_RESULTS_MODULE, + _GENERATE_RESULTS_NAME, + GenerateResultsWrapper(helper), + ) + self._entry_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap %s.%s: %s", + _GENERATE_RESULTS_MODULE, + _GENERATE_RESULTS_NAME, + exc, + ) + + # 2) AGENT ----------------------------------------------------- + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + _BASE_HANDLER_NAME, + BaseHandlerInferenceWrapper(helper), + ) + self._inference_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap %s.%s: %s", + _BASE_HANDLER_MODULE, + _BASE_HANDLER_NAME, + exc, + ) + + # 3) STEP + 4) turn maintenance -------------------------------- + self._instrument_handlers(helper) + + # 5) TOOL ------------------------------------------------------ + # ``execute_multi_turn_func_call`` is re-exported via ``from ... import`` + # in several BFCL modules, so wrapping just the source module misses + # the call sites that use the local binding. We wrap each known + # re-export site as well to guarantee the TOOL span is always emitted. + tool_targets = [ + (_EXECUTE_TOOL_MODULE, _EXECUTE_TOOL_NAME), + ( + "bfcl_eval.model_handler.base_handler", + _EXECUTE_TOOL_NAME, + ), + ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker", + _EXECUTE_TOOL_NAME, + ), + ] + wrapper_instance = ExecuteFuncCallWrapper(helper) + self._tool_targets = [] + for module_name, attr_name in tool_targets: + try: + wrap_function_wrapper( + module_name, + attr_name, + wrapper_instance, + ) + self._tool_targets.append((module_name, attr_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s: %s", + module_name, + attr_name, + exc, + ) + self._tool_wrapped = bool(self._tool_targets) + + def _instrument_handlers(self, helper: GenAIHookHelper) -> None: + # Reflectively wrap every concrete ``_query_FC`` / ``_query_prompting`` + # plus the turn-maintenance helpers; we de-duplicate by function id so + # subclasses that share an inherited implementation are wrapped only + # once. + seen_func_ids: set[int] = set() + + query_pairs = ( + ("_query_FC", "FC"), + ("_query_prompting", "prompting"), + ) + turn_pairs = ( + ("add_first_turn_message_FC", True), + ("add_first_turn_message_prompting", True), + ("_add_next_turn_user_message_FC", False), + ("_add_next_turn_user_message_prompting", False), + ) + + for cls in _iter_handler_classes(): + class_dict = getattr(cls, "__dict__", {}) + for method_name, mode in query_pairs: + method = class_dict.get(method_name) + if method is None or not callable(method): + continue + key = id(method) + if key in seen_func_ids: + continue + seen_func_ids.add(key) + try: + wrap_function_wrapper( + cls.__module__, + f"{cls.__name__}.{method_name}", + QueryWrapper(helper, mode), + ) + self._wrapped_query_methods.append((cls, method_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s.%s: %s", + cls.__module__, + cls.__name__, + method_name, + exc, + ) + + for method_name, is_first in turn_pairs: + method = class_dict.get(method_name) + if method is None or not callable(method): + continue + key = id(method) + if key in seen_func_ids: + continue + seen_func_ids.add(key) + try: + wrap_function_wrapper( + cls.__module__, + f"{cls.__name__}.{method_name}", + TurnBumpWrapper(reset=is_first), + ) + self._wrapped_turn_methods.append((cls, method_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s.%s: %s", + cls.__module__, + cls.__name__, + method_name, + exc, + ) + + # ------------------------------------------------------------------ + # _uninstrument + + def _uninstrument(self, **kwargs: Any) -> None: # noqa: D401 + if self._tool_wrapped: + for module_name, attr_name in getattr(self, "_tool_targets", []): + try: + module = importlib.import_module(module_name) + unwrap(module, attr_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + module_name, + attr_name, + exc, + ) + self._tool_targets = [] + self._tool_wrapped = False + + for cls, method_name in self._wrapped_query_methods: + try: + unwrap(cls, method_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + cls.__name__, + method_name, + exc, + ) + self._wrapped_query_methods = [] + + for cls, method_name in self._wrapped_turn_methods: + try: + unwrap(cls, method_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + cls.__name__, + method_name, + exc, + ) + self._wrapped_turn_methods = [] + + if self._inference_wrapped: + try: + base_module = importlib.import_module(_BASE_HANDLER_MODULE) + unwrap(base_module.BaseHandler, "inference") + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap BaseHandler.inference: %s", exc + ) + self._inference_wrapped = False + + if self._entry_wrapped: + try: + module = importlib.import_module(_GENERATE_RESULTS_MODULE) + unwrap(module, _GENERATE_RESULTS_NAME) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap generate_results: %s", exc + ) + self._entry_wrapped = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py new file mode 100644 index 000000000..b0a6f4284 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py @@ -0,0 +1,13 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py new file mode 100644 index 000000000..774200aba --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py @@ -0,0 +1,38 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constant attribute keys used by the BFCL v4 instrumentation.""" + +from __future__ import annotations + +from typing import Final + +FRAMEWORK_NAME: Final = "bfclv4" + +# gen_ai.* attribute keys that are not exported by +# opentelemetry-semantic-conventions today. +GEN_AI_FRAMEWORK: Final = "gen_ai.framework" +GEN_AI_PROVIDER_NAME: Final = "gen_ai.provider.name" + +# BFCL-specific (vendor) attribute keys. +BFCL_TEST_CATEGORY: Final = "bfcl.test_category" +BFCL_NUM_THREADS: Final = "bfcl.num_threads" +BFCL_TEST_CASE_COUNT: Final = "bfcl.test_case_count" +BFCL_RUN_IDS: Final = "bfcl.run_ids" +BFCL_TEST_ENTRY_ID: Final = "bfcl.test_entry_id" +BFCL_TURN_IDX: Final = "bfcl.turn_idx" +BFCL_QUERY_MODE: Final = "bfcl.query_mode" +BFCL_OSS_BACKEND: Final = "bfcl.oss.backend" +BFCL_TOOL_DURATION_IS_ESTIMATED: Final = "bfcl.tool.duration_is_estimated" +BFCL_TOOL_INDEX: Final = "bfcl.tool.index" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py new file mode 100644 index 000000000..efa2c77dc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py @@ -0,0 +1,71 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Map BFCL ``ModelStyle`` enum values to ``gen_ai.provider.name``.""" + +from __future__ import annotations + +import os +from typing import Any, Dict, Tuple + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_OSS_BACKEND, +) + +# The BFCL backend name (vllm / sglang / ...) is communicated from the ENTRY +# wrapper to the per-thread STEP/AGENT wrappers via this env var. The ENTRY +# wrapper writes to it before invoking the wrapped function and clears it in +# the ``finally`` clause. +OSS_BACKEND_ENV = "BFCL_BACKEND" + + +def infer_provider(handler: Any) -> Tuple[str, Dict[str, Any]]: + """Return ``(provider_name, extra_attributes)`` for a BFCL handler. + + Falls back to ``"unknown"`` if BFCL is not importable or if the handler + has no ``model_style`` attribute. + """ + + try: + from bfcl_eval.constants.enums import ( # noqa: PLC0415 + ModelStyle, + ) + except ImportError: + return "unknown", {} + + style = getattr(handler, "model_style", None) + if style is None: + return "unknown", {} + + if style is ModelStyle.OSSMODEL: + backend = (os.getenv(OSS_BACKEND_ENV) or "").lower() + if backend in ("vllm", "sglang"): + return backend, {BFCL_OSS_BACKEND: backend} + return "oss", {BFCL_OSS_BACKEND: "unknown"} + + mapping = { + ModelStyle.OPENAI_COMPLETIONS: "openai", + ModelStyle.OPENAI_RESPONSES: "openai", + ModelStyle.ANTHROPIC: "anthropic", + ModelStyle.GOOGLE: "gcp.gemini", + ModelStyle.MISTRAL: "mistral_ai", + ModelStyle.COHERE: "cohere", + ModelStyle.AMAZON: "aws.bedrock", + ModelStyle.FIREWORK_AI: "fireworks_ai", + ModelStyle.WRITER: "writer", + ModelStyle.NOVITA_AI: "novita", + ModelStyle.NEXUS: "nexusflow", + ModelStyle.GORILLA: "gorilla", + } + return mapping.get(style, "unknown"), {} diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py new file mode 100644 index 000000000..ae4861035 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py @@ -0,0 +1,93 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Per-thread ReAct state for the BFCL v4 instrumentation. + +We use ``contextvars.ContextVar`` so that each worker thread spawned by the +BFCL ``ThreadPoolExecutor`` gets its own copy. ``_ContextPropagatingExecutor`` +in :mod:`threading_propagation` makes sure ENTRY-time context is copied into +the worker thread; the BaseHandler.inference wrapper then initializes a fresh +state on top of that copy. +""" + +from __future__ import annotations + +import contextvars +from typing import Any, Dict, Optional + +_REACT_STATE: contextvars.ContextVar[Optional[Dict[str, Any]]] = ( + contextvars.ContextVar("bfclv4_react_state", default=None) +) + + +def init_state() -> contextvars.Token: + """Initialise per-AGENT state and return the reset token.""" + state: Dict[str, Any] = { + # ``turn_idx`` is incremented by the wrapper around + # ``_add_next_turn_user_message_*``; it stays ``0`` for single-turn + # tests. + "turn_idx": 0, + # ``fc_round`` is the ReAct round counter. We bump it on every STEP + # entry so the first STEP within a turn ends up with ``round=1``. + "fc_round": 0, + # Counter of executed tool calls within the current AGENT - useful for + # the TOOL span ``tool_call_id`` synthesis. + "tool_index": 0, + } + return _REACT_STATE.set(state) + + +def reset_state(token: contextvars.Token) -> None: + try: + _REACT_STATE.reset(token) + except (LookupError, ValueError): + # Token may have already been reset (e.g. nested error path). + pass + + +def get_state() -> Optional[Dict[str, Any]]: + return _REACT_STATE.get() + + +def bump_round() -> int: + state = _REACT_STATE.get() + if state is None: + return 1 + state["fc_round"] = state.get("fc_round", 0) + 1 + return state["fc_round"] + + +def reset_round_for_turn() -> None: + state = _REACT_STATE.get() + if state is None: + return + state["fc_round"] = 0 + + +def bump_turn() -> int: + state = _REACT_STATE.get() + if state is None: + return 0 + state["turn_idx"] = state.get("turn_idx", 0) + 1 + state["fc_round"] = 0 + return state["turn_idx"] + + +def next_tool_index() -> int: + state = _REACT_STATE.get() + if state is None: + return 0 + idx = state.get("tool_index", 0) + state["tool_index"] = idx + 1 + return idx diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py new file mode 100644 index 000000000..d19c05799 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py @@ -0,0 +1,43 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Context-propagating ``ThreadPoolExecutor`` used by the ENTRY wrapper. + +``concurrent.futures.ThreadPoolExecutor`` does not automatically copy the +current ``contextvars`` context (which holds the OTel current span) into +worker threads. We subclass it and copy ``contextvars.copy_context()`` per +``submit`` so the AGENT span created inside the worker thread can attach as +a child of the ENTRY span. + +We only swap the ``ThreadPoolExecutor`` *name* in the +``bfcl_eval._llm_response_generation`` namespace; the global +``concurrent.futures.ThreadPoolExecutor`` is untouched. +""" + +from __future__ import annotations + +import contextvars +from concurrent.futures import ThreadPoolExecutor as _RealExecutor + + +class ContextPropagatingExecutor(_RealExecutor): + """``ThreadPoolExecutor`` that propagates the calling ``Context``. + + Only the ``submit`` method is overridden because BFCL only uses + ``submit`` (see ``_llm_response_generation.generate_results``). + """ + + def submit(self, fn, /, *args, **kwargs): # type: ignore[override] + ctx = contextvars.copy_context() + return super().submit(ctx.run, fn, *args, **kwargs) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py new file mode 100644 index 000000000..cb9bd3f34 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py @@ -0,0 +1,694 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Wrapper classes for the BFCL v4 instrumentation. + +Each wrapper follows the standard ``wrapt`` callable contract:: + + def __call__(self, wrapped, instance, args, kwargs): + ... + +All wrappers rely on :func:`get_extended_telemetry_handler` (LoongSuite +``util-genai``) to create the actual spans, so that ENTRY / AGENT / STEP / +TOOL spans get the canonical ``gen_ai.span.kind`` and operation-name values +that the LoongSuite semantic-validator expects. +""" + +from __future__ import annotations + +import logging +import os +import time +from typing import Any, Callable, Iterable, List, Optional + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_NUM_THREADS, + BFCL_OSS_BACKEND, + BFCL_QUERY_MODE, + BFCL_RUN_IDS, + BFCL_TEST_CASE_COUNT, + BFCL_TEST_CATEGORY, + BFCL_TEST_ENTRY_ID, + BFCL_TOOL_DURATION_IS_ESTIMATED, + BFCL_TOOL_INDEX, + BFCL_TURN_IDX, + FRAMEWORK_NAME, + GEN_AI_FRAMEWORK, + GEN_AI_PROVIDER_NAME, +) +from opentelemetry.instrumentation.bfclv4.internal.provider import ( + OSS_BACKEND_ENV, + infer_provider, +) +from opentelemetry.instrumentation.bfclv4.internal.state import ( + bump_round, + bump_turn, + init_state, + next_tool_index, + reset_state, +) +from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import ( + ContextPropagatingExecutor, +) +from opentelemetry.instrumentation.bfclv4.utils import ( + GenAIHookHelper, + to_text_input, + to_text_output, + truncate_text, +) +from opentelemetry.util.genai.extended_handler import ( + get_extended_telemetry_handler, +) +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Helpers + + +def _safe_get(obj: Any, key: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _flatten_tokens(value: Any) -> Optional[int]: + """Sum a possibly nested ``int|float|list|list[list]`` BFCL token field.""" + if value is None: + return None + if isinstance(value, (int, float)): + return int(value) + if isinstance(value, Iterable): + total = 0 + any_seen = False + for item in value: + sub = _flatten_tokens(item) + if sub is not None: + total += sub + any_seen = True + if any_seen: + return total + return None + + +def _test_category_from_id(test_entry_id: Optional[str]) -> Optional[str]: + if not test_entry_id or "_" not in test_entry_id: + return None + return test_entry_id.rsplit("_", 1)[0] + + +def _join_test_category(value: Any) -> Optional[str]: + if value is None: + return None + if isinstance(value, str): + return value + if isinstance(value, (list, tuple, set)): + joined = ",".join(str(v) for v in value if v is not None) + return joined or None + return str(value) + + +# --------------------------------------------------------------------------- +# ENTRY wrapper + + +class GenerateResultsWrapper: + """Wraps ``bfcl_eval._llm_response_generation.generate_results``. + + Responsibilities: + + * Open the ENTRY span (``enter_ai_application_system``). + * Temporarily swap the ``ThreadPoolExecutor`` reference inside the BFCL + generation module to a context-propagating subclass so that AGENT spans + created in worker threads inherit the ENTRY span as parent. + * Publish ``args.backend`` to ``BFCL_BACKEND`` so that + :func:`infer_provider` can attribute OSS spans to vllm / sglang. + """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``generate_results(args, model_name, test_cases_total)`` + cli_args = args[0] if len(args) >= 1 else kwargs.get("args") + model_name = args[1] if len(args) >= 2 else kwargs.get("model_name") + test_cases_total = ( + args[2] if len(args) >= 3 else kwargs.get("test_cases_total") + ) + + try: + from bfcl_eval import ( # noqa: PLC0415 + _llm_response_generation as _bfcl_gen, + ) + except ImportError: + return wrapped(*args, **kwargs) + + original_executor = getattr(_bfcl_gen, "ThreadPoolExecutor", None) + if original_executor is not None: + _bfcl_gen.ThreadPoolExecutor = ContextPropagatingExecutor + + backend_value = ( + _safe_get(cli_args, "backend", None) if cli_args is not None else None + ) + previous_backend_env = os.environ.get(OSS_BACKEND_ENV) + if backend_value: + os.environ[OSS_BACKEND_ENV] = str(backend_value) + + session_id_default = None + if model_name is not None: + try: + session_id_default = f"{model_name}@{int(time.time())}" + except Exception: # noqa: BLE001 + session_id_default = None + session_id = ( + os.environ.get("BFCL_SESSION_ID") or session_id_default + ) + + entry_inv = EntryInvocation(session_id=session_id) + handler = get_extended_telemetry_handler() + + attributes = {GEN_AI_FRAMEWORK: FRAMEWORK_NAME} + category_value = _join_test_category( + _safe_get(cli_args, "test_category", None) + ) + if category_value: + attributes[BFCL_TEST_CATEGORY] = category_value + num_threads = _safe_get(cli_args, "num_threads", None) + if num_threads is not None: + try: + attributes[BFCL_NUM_THREADS] = int(num_threads) + except (TypeError, ValueError): + pass + if isinstance(test_cases_total, (list, tuple)): + attributes[BFCL_TEST_CASE_COUNT] = len(test_cases_total) + attributes[BFCL_RUN_IDS] = bool( + _safe_get(cli_args, "run_ids", False) + ) + + try: + with handler.entry(entry_inv) as inv: + if inv.span is not None and inv.span.is_recording(): + for key, value in attributes.items(): + try: + inv.span.set_attribute(key, value) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 ENTRY set_attribute(%s) failed", + key, + exc_info=True, + ) + return wrapped(*args, **kwargs) + finally: + if original_executor is not None: + try: + _bfcl_gen.ThreadPoolExecutor = original_executor + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 ENTRY: failed to restore ThreadPoolExecutor", + exc_info=True, + ) + if backend_value: + if previous_backend_env is None: + os.environ.pop(OSS_BACKEND_ENV, None) + else: + os.environ[OSS_BACKEND_ENV] = previous_backend_env + + +# --------------------------------------------------------------------------- +# AGENT wrapper + + +_BFCL_INFERENCE_ERROR_PREFIX = "Error during inference:" + + +class BaseHandlerInferenceWrapper: + """Wraps ``BaseHandler.inference``. + + Creates the AGENT span (kind=AGENT, op=invoke_agent) and initialises the + per-thread ReAct state used by the STEP wrapper. + + BFCL's outer ``multi_threaded_inference`` catches every exception and + converts it into a ``"Error during inference: ..."`` string; we mirror + that behaviour by setting the AGENT span status to ERROR when the + returned ``result`` looks like an error string, instead of relying on + a re-raised exception. + """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``inference(self, test_entry, include_input_log, exclude_state_log)`` + test_entry = args[0] if args else kwargs.get("test_entry") + if not isinstance(test_entry, dict): + return wrapped(*args, **kwargs) + + provider, extra_attrs = infer_provider(instance) + request_model = getattr(instance, "model_name", None) + test_entry_id = test_entry.get("id") + category = _test_category_from_id(test_entry_id) + involved_classes = test_entry.get("involved_classes") or [] + agent_description = ( + ", ".join(str(c) for c in involved_classes) + if isinstance(involved_classes, (list, tuple)) + else None + ) + + invocation = InvokeAgentInvocation( + provider=provider or "unknown", + request_model=request_model, + agent_id=test_entry_id, + agent_name=category or "bfcl_agent", + agent_description=agent_description or None, + conversation_id=test_entry_id, + ) + + token = init_state() + handler = get_extended_telemetry_handler() + try: + with handler.invoke_agent(invocation) as inv: + if inv.span is not None and inv.span.is_recording(): + inv.span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + if provider: + inv.span.set_attribute(GEN_AI_PROVIDER_NAME, provider) + if test_entry_id is not None: + inv.span.set_attribute( + BFCL_TEST_ENTRY_ID, test_entry_id + ) + if category is not None: + inv.span.set_attribute(BFCL_TEST_CATEGORY, category) + for key, value in extra_attrs.items(): + if value is not None: + inv.span.set_attribute(key, value) + + # Capture inputs for the AGENT (gated by content-capture mode). + question = test_entry.get("question") + if question is not None: + inv.input_messages = to_text_input( + "user", truncate_text(_safe_str(question)) + ) + + # Run the original inference call. + try: + result = wrapped(*args, **kwargs) + except Exception as exc: + # The CM will mark the span as failed; we leave it to + # the handler/CM to call ``fail_invoke_agent``. + raise exc + + # Detect BFCL's own captured error path (no exception raised + # but the returned result is the error string). + result_payload = ( + result[0] if isinstance(result, tuple) and result else None + ) + metadata_payload = ( + result[1] + if isinstance(result, tuple) and len(result) >= 2 + else None + ) + + if ( + isinstance(result_payload, str) + and result_payload.startswith(_BFCL_INFERENCE_ERROR_PREFIX) + and inv.span is not None + and inv.span.is_recording() + ): + try: + from opentelemetry.trace import Status, StatusCode + + inv.span.set_status( + Status(StatusCode.ERROR, result_payload[:200]) + ) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 AGENT: failed to set ERROR status", + exc_info=True, + ) + + if isinstance(metadata_payload, dict): + input_tokens = _flatten_tokens( + metadata_payload.get("input_token_count") + ) + output_tokens = _flatten_tokens( + metadata_payload.get("output_token_count") + ) + if input_tokens is not None: + inv.input_tokens = input_tokens + if output_tokens is not None: + inv.output_tokens = output_tokens + + if result_payload is not None: + inv.output_messages = to_text_output( + "assistant", + truncate_text(_safe_str(result_payload)), + ) + + return result + finally: + reset_state(token) + + +def _safe_str(value: Any) -> str: + try: + if isinstance(value, str): + return value + import json + + return json.dumps(value, ensure_ascii=False, default=str) + except Exception: # noqa: BLE001 + try: + return str(value) + except Exception: # noqa: BLE001 + return "" + + +# --------------------------------------------------------------------------- +# STEP wrapper + + +class QueryWrapper: + """Wraps ``._query_FC`` / ``_query_prompting``. + + Creates a ReAct STEP span, attaches token usage by re-calling the + handler's matching ``_parse_query_response_*`` (which is documented as + side-effect-free). + """ + + def __init__(self, helper: GenAIHookHelper, mode: str) -> None: + self._helper = helper + self._mode = mode # "FC" or "prompting" + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + round_idx = bump_round() + provider, extra_attrs = infer_provider(instance) + + invocation = ReactStepInvocation(round=round_idx) + handler_obj = get_extended_telemetry_handler() + with handler_obj.react_step(invocation) as step_inv: + span = step_inv.span + if span is not None and span.is_recording(): + span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + span.set_attribute(BFCL_QUERY_MODE, self._mode) + if provider: + span.set_attribute(GEN_AI_PROVIDER_NAME, provider) + model_name = getattr(instance, "model_name", None) + if model_name: + span.set_attribute( + "gen_ai.request.model", str(model_name) + ) + from opentelemetry.instrumentation.bfclv4.internal.state import ( + get_state, + ) + + state = get_state() + if state is not None: + span.set_attribute(BFCL_TURN_IDX, state.get("turn_idx", 0)) + for key, value in extra_attrs.items(): + if value is not None: + span.set_attribute(key, value) + + try: + api_response, query_latency = wrapped(*args, **kwargs) + except Exception: + # Let the context-manager mark the span as failed; the BFCL + # outer try/except will turn this into an "Error during + # inference: ..." result string at the AGENT layer. + raise + + # When the underlying handler returns a streaming wrapper + # (e.g. ``ChatStreamWrapper`` from openai-v2), the LLM span and + # its OTel context attach are kept alive until the stream is + # consumed by BFCL's ``_parse_query_response_*`` *outside* of + # this STEP context manager. That breaks the LIFO ordering of + # context attach/detach, leaving the LLM span as the "current" + # span after the STEP CM exits, which causes the next STEP and + # any TOOL spans to be parented to the previous STEP rather + # than to the AGENT. + # + # To preserve LIFO ordering, force-consume the stream here + # (inside the STEP context) and replace it with a plain + # iterator over the cached chunks. This makes ``stop_llm`` + # (which detaches the LLM context) run *before* STEP detaches. + if api_response is not None and hasattr( + api_response, "__next__" + ) and not isinstance(api_response, (str, bytes)): + try: + chunks = list(api_response) + api_response = iter(chunks) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 STEP: failed to materialise streaming " + "response; LLM/STEP nesting may be incorrect", + exc_info=True, + ) + + # Post-call attribute enrichment - use try/except so that any + # vendor-side parsing surprise never breaks BFCL itself. + # + # IMPORTANT: We must NOT re-call ``_parse_query_response_*`` here, + # because for streaming providers (e.g. Qwen DashScope) the + # ``api_response`` is a single-pass generator that the parser + # consumes; calling it twice leaves BFCL's own subsequent call to + # the parser with an exhausted iterator, which crashes inference + # with ``UnboundLocalError: chunk``. Token usage will instead be + # recovered later from the AGENT-level metadata payload. + try: + if span is not None and span.is_recording(): + if isinstance(query_latency, (int, float)): + try: + span.set_attribute( + "gen_ai.response.time_to_first_token", + int(float(query_latency) * 1e9), + ) + except Exception: # noqa: BLE001 + pass + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 STEP: post-call enrichment failed", exc_info=True + ) + + return api_response, query_latency + + +def _infer_finish_reason(model_responses: Any) -> str: + """Best-effort heuristic for ``gen_ai.react.finish_reason``.""" + if model_responses is None: + return "unknown" + if isinstance(model_responses, list): + if len(model_responses) == 0: + return "empty_response" + if len(model_responses) == 1 and not model_responses[0]: + return "empty_response" + return "tool_calls" + if isinstance(model_responses, str): + # Prompting models often return decoded strings even when there are + # no tool calls - treat as "stop" so downstream callers know there is + # no further work to do. + return "stop" + return "continue" + + +# --------------------------------------------------------------------------- +# turn_idx maintenance wrappers (no spans) + + +class TurnBumpWrapper: + """Wraps ``.add_first_turn_message_*`` and + ``._add_next_turn_user_message_*`` to keep ``bfcl.turn_idx`` in + sync. No spans are created here. + """ + + def __init__(self, *, reset: bool) -> None: + self._reset = reset + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + try: + if self._reset: + # ``add_first_turn_message_*`` runs once at the very start of + # multi-turn / single-turn inference. We only want to reset + # to ``turn_idx=0`` here. + from opentelemetry.instrumentation.bfclv4.internal.state import ( + get_state, + ) + + state = get_state() + if state is not None: + state["turn_idx"] = 0 + state["fc_round"] = 0 + else: + bump_turn() + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4: turn_idx maintenance failed", exc_info=True + ) + return wrapped(*args, **kwargs) + + +# --------------------------------------------------------------------------- +# TOOL wrapper + + +class ExecuteFuncCallWrapper: + """Wraps + ``bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call``. + + BFCL evaluates a list of function-call strings in a single Python call; + we surface each one as its own TOOL span by post-processing the wrapped + result. Per-call latency is approximated by averaging the total elapsed + time across the batch (``bfcl.tool.duration_is_estimated=true``). + """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``execute_multi_turn_func_call(func_call_list, initial_config, + # involved_classes, model_name, + # test_entry_id, long_context=False, + # is_evaL_run=False)`` + func_call_list = ( + args[0] if args else kwargs.get("func_call_list", []) + ) + model_name = ( + args[3] + if len(args) >= 4 + else kwargs.get("model_name") + ) + test_entry_id = ( + args[4] + if len(args) >= 5 + else kwargs.get("test_entry_id") + ) + + if not isinstance(func_call_list, list) or not func_call_list: + return wrapped(*args, **kwargs) + + t0 = time.perf_counter() + try: + result = wrapped(*args, **kwargs) + finally: + elapsed = max(time.perf_counter() - t0, 0.0) + + execution_results: List[str] = [] + if isinstance(result, tuple) and result: + payload = result[0] + if isinstance(payload, list): + execution_results = list(payload) + + per_call_seconds = ( + elapsed / len(func_call_list) if func_call_list else 0.0 + ) + + handler_obj = get_extended_telemetry_handler() + for index, func_call in enumerate(func_call_list): + tool_name = _extract_tool_name(func_call) + arguments = _extract_tool_arguments(func_call) + execution_result = ( + execution_results[index] + if index < len(execution_results) + else None + ) + + tool_inv = ExecuteToolInvocation( + tool_name=tool_name, + tool_call_id=_synth_tool_call_id( + test_entry_id, model_name, index + ), + tool_type="function", + tool_call_arguments=arguments, + tool_call_result=execution_result, + ) + + try: + with handler_obj.execute_tool(tool_inv) as inv: + span = inv.span + if span is not None and span.is_recording(): + span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + span.set_attribute(BFCL_TOOL_INDEX, index) + span.set_attribute( + BFCL_TOOL_DURATION_IS_ESTIMATED, True + ) + if test_entry_id is not None: + span.set_attribute( + BFCL_TEST_ENTRY_ID, str(test_entry_id) + ) + if isinstance(execution_result, str) and execution_result.startswith( + "Error during execution:" + ): + try: + from opentelemetry.trace import ( + Status, + StatusCode, + ) + + span.set_status( + Status( + StatusCode.ERROR, + execution_result[:200], + ) + ) + except Exception: # noqa: BLE001 + pass + # Approximate latency by sleeping the budgeted slice + # would distort BFCL execution; we instead rely on + # span start/end (currently both wall-clock-now). + # The ``bfcl.tool.duration_is_estimated`` attribute + # signals the limitation to consumers. + _ = per_call_seconds # unused but documented + # Bump a per-AGENT counter for downstream debugging. + next_tool_index() + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 TOOL: span emission failed for %s", + tool_name, + exc_info=True, + ) + + return result + + +def _extract_tool_name(func_call: Any) -> str: + if not isinstance(func_call, str) or "(" not in func_call: + return "unknown" + head = func_call.split("(", 1)[0] + # ``head`` may be ``module.method`` or ``instance.method`` - keep the + # last segment which is the actual callable. + return head.split(".")[-1] or "unknown" + + +def _extract_tool_arguments(func_call: Any) -> Optional[str]: + if not isinstance(func_call, str): + return None + if "(" not in func_call or not func_call.endswith(")"): + return func_call + args_part = func_call[func_call.index("(") + 1 : -1] + return args_part if args_part else None + + +def _synth_tool_call_id( + test_entry_id: Optional[Any], model_name: Optional[Any], index: int +) -> str: + parts = [ + str(test_entry_id) if test_entry_id is not None else "no_id", + str(model_name) if model_name is not None else "no_model", + str(index), + ] + return "-".join(parts) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py new file mode 100644 index 000000000..66e9fa6e1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py @@ -0,0 +1,17 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_instruments = ("bfcl-eval >= 4.0.0",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py new file mode 100644 index 000000000..c63bbc62b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py @@ -0,0 +1,144 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for the BFCL v4 instrumentation. + +The :class:`GenAIHookHelper` mirrors the helper used by the LoongSuite CrewAI +instrumentation: it gates ``gen_ai.input.messages`` / +``gen_ai.output.messages`` / ``gen_ai.system_instructions`` on the standard +LoongSuite content-capture environment knobs so that prompt content is not +exported by default. +""" + +from __future__ import annotations + +import dataclasses +import logging +from typing import Any, Dict, List, Optional + +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import Span +from opentelemetry.util.genai.types import ( + ContentCapturingMode, + InputMessage, + MessagePart, + OutputMessage, + Text, +) +from opentelemetry.util.genai.utils import ( + gen_ai_json_dumps, + get_content_capturing_mode, + is_experimental_mode, +) + +logger = logging.getLogger(__name__) + + +class GenAIHookHelper: + """Conditionally write prompt / completion content to the span.""" + + def __init__(self, capture_content: bool = True) -> None: + self.capture_content = capture_content + + def on_completion( + self, + span: Span, + inputs: Optional[List[InputMessage]] = None, + outputs: Optional[List[OutputMessage]] = None, + system_instructions: Optional[List[MessagePart]] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> None: + if not span.is_recording(): + return + + if self.capture_content and is_experimental_mode(): + mode = get_content_capturing_mode() + should_capture_span = mode in ( + ContentCapturingMode.SPAN_ONLY, + ContentCapturingMode.SPAN_AND_EVENT, + ) + + if should_capture_span: + if inputs: + span.set_attribute( + gen_ai_attributes.GEN_AI_INPUT_MESSAGES, + gen_ai_json_dumps( + [dataclasses.asdict(i) for i in inputs] + ), + ) + if outputs: + span.set_attribute( + gen_ai_attributes.GEN_AI_OUTPUT_MESSAGES, + gen_ai_json_dumps( + [dataclasses.asdict(o) for o in outputs] + ), + ) + if system_instructions: + span.set_attribute( + gen_ai_attributes.GEN_AI_SYSTEM_INSTRUCTIONS, + gen_ai_json_dumps( + [dataclasses.asdict(s) for s in system_instructions] + ), + ) + + if attributes: + for key, value in attributes.items(): + if value is None: + continue + try: + span.set_attribute(key, value) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4: failed to set attribute %s", key, exc_info=True + ) + + +def to_text_input(role: str, content: Any) -> List[InputMessage]: + if content in (None, "", [], {}): + return [] + text = content if isinstance(content, str) else _to_safe_str(content) + return [InputMessage(role=role, parts=[Text(content=text)])] + + +def to_text_output( + role: str, content: Any, finish_reason: str = "stop" +) -> List[OutputMessage]: + if content in (None, "", [], {}): + return [] + text = content if isinstance(content, str) else _to_safe_str(content) + return [ + OutputMessage( + role=role, parts=[Text(content=text)], finish_reason=finish_reason + ) + ] + + +def _to_safe_str(value: Any) -> str: + """Best-effort JSON serialisation, falling back to ``str()``. + + The wrapper code never wants a serialisation failure to break a span. + """ + try: + return gen_ai_json_dumps(value) + except Exception: # noqa: BLE001 + try: + return str(value) + except Exception: # noqa: BLE001 + return "" + + +def truncate_text(value: str, limit: int = 4096) -> str: + if len(value) <= limit: + return value + return value[:limit] + f"..." diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py new file mode 100644 index 000000000..3263662eb --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.1.3.dev0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py new file mode 100644 index 000000000..41446ee3b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py @@ -0,0 +1,52 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Smoke tests for ``BFCLv4Instrumentor``. + +These tests do not require ``bfcl-eval`` to be installed; they only verify +that importing the package and calling ``instrument()`` / ``uninstrument()`` +works (and degrades gracefully when ``bfcl-eval`` is missing). +""" + +import importlib + +import pytest + + +def test_import_instrumentor_package(): + module = importlib.import_module("opentelemetry.instrumentation.bfclv4") + assert hasattr(module, "BFCLv4Instrumentor") + + +def test_instrumentation_dependencies_listed(): + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + from opentelemetry.instrumentation.bfclv4.package import _instruments + + instr = BFCLv4Instrumentor() + assert tuple(instr.instrumentation_dependencies()) == _instruments + + +def test_instrument_uninstrument_no_bfcl_no_raise(): + """When ``bfcl-eval`` is missing, every wrap call logs and continues. + + The instrumentor must not raise from ``instrument()`` / + ``uninstrument()`` even if the target framework cannot be imported. + """ + + pytest.importorskip("opentelemetry.util.genai.extended_handler") + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + + instr = BFCLv4Instrumentor() + instr.instrument() + instr.uninstrument() diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py new file mode 100644 index 000000000..21bbf6348 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py @@ -0,0 +1,113 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for the framework-agnostic helpers.""" + +import contextvars + +import pytest + + +def test_state_lifecycle(): + from opentelemetry.instrumentation.bfclv4.internal.state import ( + bump_round, + bump_turn, + get_state, + init_state, + next_tool_index, + reset_state, + ) + + token = init_state() + try: + state = get_state() + assert state == {"turn_idx": 0, "fc_round": 0, "tool_index": 0} + + assert bump_round() == 1 + assert bump_round() == 2 + assert bump_turn() == 1 + # bump_turn resets fc_round + state = get_state() + assert state["turn_idx"] == 1 + assert state["fc_round"] == 0 + assert next_tool_index() == 0 + assert next_tool_index() == 1 + finally: + reset_state(token) + + # After reset the state should be gone (None default). + assert get_state() is None + + +def test_context_propagating_executor_carries_contextvars(): + from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import ( + ContextPropagatingExecutor, + ) + + cv: contextvars.ContextVar[str] = contextvars.ContextVar( + "bfclv4_test_cv", default="default" + ) + cv.set("from_main_thread") + + def _read(): + return cv.get() + + with ContextPropagatingExecutor(max_workers=2) as pool: + future = pool.submit(_read) + assert future.result() == "from_main_thread" + + +def test_extract_tool_name_and_arguments(): + from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + _extract_tool_arguments, + _extract_tool_name, + ) + + assert _extract_tool_name("calc.add(1, 2)") == "add" + assert _extract_tool_name("list_files()") == "list_files" + assert _extract_tool_name("not a call") == "unknown" + assert _extract_tool_arguments("foo(a=1, b=2)") == "a=1, b=2" + assert _extract_tool_arguments("foo()") is None + + +def test_infer_finish_reason_heuristic(): + from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + _infer_finish_reason, + ) + + assert _infer_finish_reason([]) == "empty_response" + assert _infer_finish_reason([[]]) == "empty_response" + assert _infer_finish_reason([{"name": "x"}]) == "tool_calls" + assert _infer_finish_reason("plain string") == "stop" + assert _infer_finish_reason(None) == "unknown" + + +def test_provider_mapping_without_bfcl(monkeypatch): + from opentelemetry.instrumentation.bfclv4.internal.provider import ( + infer_provider, + ) + + pytest.importorskip( + "opentelemetry.util.genai.extended_types", + ) + + class _Dummy: + model_style = None + + name, extras = infer_provider(_Dummy()) + # If bfcl-eval is not installed, ``ModelStyle`` import fails and we get + # ``unknown``; otherwise we still get ``unknown`` because ``model_style`` + # is None. + assert name == "unknown" + assert extras == {} diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst new file mode 100644 index 000000000..8f4dc9a8b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst @@ -0,0 +1,90 @@ +OpenTelemetry OpenHands Instrumentation +======================================== + +Automatic OpenTelemetry instrumentation for the legacy OpenHands V0 / +CodeAct runtime. + +What is covered +--------------- + +This package wraps the V0 ``python -m openhands.core.main`` execution path: + +* ``openhands.core.main.run_controller`` for the ENTRY span. +* ``openhands.core.loop.run_agent_until_done`` for the AGENT span fallback. +* ``AgentController.__init__`` / ``AgentController.close`` for lifecycle-bound + ENTRY and AGENT spans that survive ``python -m`` from-import binding. +* ``AgentController._step`` for ReAct STEP spans. +* ``Runtime.run_action`` for TOOL spans. +* ``LLM.__init__`` to bridge the current OpenHands context into LiteLLM calls. + +Span tree +--------- + +:: + + ENTRY enter openhands + `-- AGENT invoke_agent codeact + |-- STEP react step [xN] + | |-- LLM chat {model} + | `-- TOOL execute_tool {tool_name} + `-- STEP react step [...] + +``python -m`` and from-import binding +------------------------------------- + +When OpenHands V0 is launched via ``python -m openhands.core.main``, Python +executes ``main.py`` as ``__main__``. Symbols imported with ``from ... import`` +can be bound before module-level wrappers are installed, so patching +``openhands.core.main.run_controller`` is not enough by itself. + +To keep ENTRY and AGENT spans reliable, this instrumentation primarily opens +them from ``AgentController.__init__`` and closes them from +``AgentController.close``. The module-level wrappers remain as a fallback for +programmatic invocations. + +Cross-thread context bridge +--------------------------- + +OpenHands V0 may execute controller steps and runtime tool calls in worker +threads with fresh asyncio loops. The instrumentation stores the active OTel +context by session id and re-attaches it in STEP, TOOL, and LLM bridge wrappers +so the trace remains: + +``ENTRY -> AGENT -> STEP -> (LLM / TOOL)``. + +Semantic-convention I/O capture +------------------------------- + +ENTRY, AGENT, STEP, and TOOL spans emit ``input.value`` / ``output.value`` and +GenAI semantic attributes where applicable. + +* **ENTRY** emits ``gen_ai.input.messages`` and ``gen_ai.output.messages`` using + the ARMS parts-based message schema. +* **AGENT** emits ``gen_ai.input.messages``, ``gen_ai.output.messages``, + ``gen_ai.system_instructions`` / ``gen_ai.system_instruction``, and + ``gen_ai.tool.definitions``. +* **STEP** emits recent input history and the pending assistant/tool-call + output for the ReAct round. +* **TOOL** emits ``gen_ai.tool.name``, ``gen_ai.tool.type``, + ``gen_ai.tool.call.id``, ``gen_ai.tool.description``, + ``gen_ai.tool.call.arguments``, and ``gen_ai.tool.call.result``. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + + OpenHandsInstrumentor().instrument() + +Configuration +------------- + +Environment variables: + +* ``OTEL_INSTRUMENTATION_OPENHANDS_ENABLED`` (default ``true``) +* ``OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS`` (default ``true``) +* ``OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM`` (default ``true``) + +I/O capture is always on and content is emitted in full. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml new file mode 100644 index 000000000..b9f0ae7f4 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-openhands" +dynamic = ["version"] +description = "LoongSuite OpenHands Instrumentation" +readme = "README.rst" +license = "Apache-2.0" +requires-python = ">=3.10" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [] + +[project.entry-points.opentelemetry_instrumentor] +openhands = "opentelemetry.instrumentation.openhands:OpenHandsInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-openhands" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/openhands/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py new file mode 100644 index 000000000..a02a7d3b3 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py @@ -0,0 +1,265 @@ +"""OpenTelemetry OpenHands Instrumentation. + +Wraps the legacy V0 (CodeAct + AgentController + Runtime) path: + +* V0 — ``python -m openhands.core.main``. We add + ``ENTRY → AGENT → STEP → TOOL`` directly on top of the controller / runtime + call chain. LLM spans come from the bundled LiteLLM instrumentor. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + + OpenHandsInstrumentor().instrument() +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.openhands.config import ( + OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM, + OTEL_INSTRUMENTATION_OPENHANDS_ENABLED, + OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS, +) +from opentelemetry.instrumentation.openhands.package import _instruments +from opentelemetry.instrumentation.openhands.version import __version__ + +logger = logging.getLogger(__name__) + +__all__ = ["OpenHandsInstrumentor"] + + +# --------------------------------------------------------------------------- +# Wrap-point registry — single source of truth shared with _uninstrument. +# Entries: (module, qualified_name) +# --------------------------------------------------------------------------- + +_PATCH_TARGETS: list[tuple[str, str]] = [ + ("openhands.core.main", "run_controller"), + ("openhands.core.loop", "run_agent_until_done"), + # AgentController.__init__ / .close are the *primary* ENTRY+AGENT + # span source for V0 — they're class methods, so they're patchable + # regardless of the from-import binding problem in main.py + # (see v0_wrappers.AgentControllerInitWrapper docstring). + ( + "openhands.controller.agent_controller", + "AgentController.__init__", + ), + ( + "openhands.controller.agent_controller", + "AgentController.close", + ), + ( + "openhands.controller.agent_controller", + "AgentController._step", + ), + ("openhands.runtime.base", "Runtime.run_action"), + # LLM context bridge — re-attaches the current sid-stashed context + # (STEP while a step is open) onto every ``LLM.completion`` invocation + # so the downstream LiteLLM / Aliyun GenAI auto-instrumentation emits + # the LLM span as a child of STEP and shares its ``trace_id``. + ("openhands.llm.llm", "LLM.__init__"), +] + + +def _module_importable(module: str) -> bool: + try: + importlib.import_module(module) + return True + except ModuleNotFoundError: + return False + except Exception: + # Other import errors should still let the wrap attempt surface a + # warning. + return True + + +def _safe_wrap(module: str, name: str, wrapper: Any) -> bool: + """Patch ``module.name`` with ``wrapper``; classify failures sensibly.""" + if not _module_importable(module): + # OpenHands versions can move modules around. Missing V0 modules + # should not prevent applications from starting. + logger.debug( + "OpenHands instrumentation: module %s not importable, skipping %s", + module, + name, + ) + return False + try: + wrap_function_wrapper(module=module, name=name, wrapper=wrapper) + logger.debug("OpenHands instrumentation: wrapped %s.%s", module, name) + return True + except (AttributeError, ImportError) as exc: + # Attribute missing inside the module — usually a version-skew issue. + logger.warning( + "OpenHands instrumentation: could not wrap %s.%s: %s", + module, + name, + exc, + ) + return False + except Exception as exc: # pragma: no cover - defensive + logger.warning( + "OpenHands instrumentation: unexpected error wrapping %s.%s: %s", + module, + name, + exc, + ) + return False + + +def _safe_unwrap(module: str, qualname: str) -> None: + """Unwrap a previously ``wrapt``-patched function or method.""" + try: + mod = importlib.import_module(module) + except Exception: + return + parts = qualname.split(".") + obj: Any = mod + parents: list[Any] = [mod] + try: + for p in parts: + obj = getattr(obj, p) + parents.append(obj) + except Exception: + return + if not hasattr(obj, "__wrapped__"): + return + parent = parents[-2] + try: + setattr(parent, parts[-1], obj.__wrapped__) + except Exception: + pass + + +class OpenHandsInstrumentor(BaseInstrumentor): + """Instrumentation entry point for OpenHands V0.""" + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + if not OTEL_INSTRUMENTATION_OPENHANDS_ENABLED: + logger.info("OpenHands instrumentation disabled via env var") + return + + tracer_provider = kwargs.get("tracer_provider") + tracer = trace_api.get_tracer( + __name__, __version__, tracer_provider=tracer_provider + ) + + from opentelemetry.instrumentation.openhands.internal.v0_wrappers import ( + AgentControllerCloseWrapper, + AgentControllerInitWrapper, + AgentControllerStepWrapper, + LLMInitWrapper, + RunAgentUntilDoneWrapper, + RunControllerWrapper, + RuntimeRunActionWrapper, + ) + + if OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + self._install_v0_patches(tracer, { + "run_controller": RunControllerWrapper, + "run_agent_until_done": RunAgentUntilDoneWrapper, + "agent_init": AgentControllerInitWrapper, + "agent_close": AgentControllerCloseWrapper, + "agent_step": AgentControllerStepWrapper, + "runtime_run_action": RuntimeRunActionWrapper, + "llm_init": LLMInitWrapper, + }) + + # Auto-enable bundled LiteLLM instrumentation so SDK / V0 LLM + # ``litellm.completion()`` calls become LLM spans. + if OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM: + self._maybe_enable_litellm(**kwargs) + + def _install_v0_patches(self, tracer, factories) -> None: + RunControllerWrapper = factories["run_controller"] + RunAgentUntilDoneWrapper = factories["run_agent_until_done"] + AgentControllerInitWrapper = factories["agent_init"] + AgentControllerCloseWrapper = factories["agent_close"] + AgentControllerStepWrapper = factories["agent_step"] + RuntimeRunActionWrapper = factories["runtime_run_action"] + LLMInitWrapper = factories["llm_init"] + + # `run_controller` and `run_agent_until_done` patches are best-effort: + # they only fire when run_controller is called via the proper module + # path (programmatic / test). When OpenHands is launched via + # ``python -m openhands.core.main``, the from-import binding in + # main.py bypasses these patches — the AgentController.__init__ / + # .close patches below take over and produce ENTRY+AGENT spans + # reliably (class methods are immune to from-import binding). + _safe_wrap( + "openhands.core.main", + "run_controller", + RunControllerWrapper(tracer), + ) + _safe_wrap( + "openhands.core.loop", + "run_agent_until_done", + RunAgentUntilDoneWrapper(tracer), + ) + _safe_wrap( + "openhands.controller.agent_controller", + "AgentController.__init__", + AgentControllerInitWrapper(tracer), + ) + _safe_wrap( + "openhands.controller.agent_controller", + "AgentController.close", + AgentControllerCloseWrapper(tracer), + ) + _safe_wrap( + "openhands.controller.agent_controller", + "AgentController._step", + AgentControllerStepWrapper(tracer), + ) + _safe_wrap( + "openhands.runtime.base", + "Runtime.run_action", + RuntimeRunActionWrapper(tracer), + ) + # LLM context bridge — patches ``LLM.__init__`` so every instance's + # ``self._completion`` re-attaches the latest sid-stashed context. + # See ``LLMInitWrapper`` for why we need this even though the LLM + # call is synchronous: in real OpenHands deployments LiteLLM ends + # up creating its span in a thread / context that ``contextvars`` + # didn't propagate STEP into, so we re-attach explicitly. + _safe_wrap( + "openhands.llm.llm", + "LLM.__init__", + LLMInitWrapper(tracer), + ) + + def _maybe_enable_litellm(self, **kwargs: Any) -> None: + try: + from opentelemetry.instrumentation.litellm import ( + LiteLLMInstrumentor, + ) + except Exception as exc: + logger.debug( + "LiteLLM instrumentation not available, skipping: %s", exc + ) + return + try: + instr = LiteLLMInstrumentor() + already = getattr(instr, "_is_instrumented_by_opentelemetry", False) + if not already: + instr.instrument(**kwargs) + except Exception as exc: + logger.debug("Could not auto-enable LiteLLM instrumentation: %s", exc) + + def _uninstrument(self, **kwargs: Any) -> None: + for module, qualname in _PATCH_TARGETS: + _safe_unwrap(module, qualname) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/config.py new file mode 100644 index 000000000..4f5ad38db --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/config.py @@ -0,0 +1,25 @@ +"""Environment-variable driven configuration for the OpenHands instrumentation.""" + +from __future__ import annotations + +import os + + +def _bool_env(name: str, default: bool) -> bool: + val = os.getenv(name) + if val is None: + return default + return val.strip().lower() in {"true", "1", "yes", "on"} + + +OTEL_INSTRUMENTATION_OPENHANDS_ENABLED = _bool_env( + "OTEL_INSTRUMENTATION_OPENHANDS_ENABLED", True +) + +OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS = _bool_env( + "OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS", True +) + +OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM = _bool_env( + "OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM", True +) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py new file mode 100644 index 000000000..7b2c8b6a1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py @@ -0,0 +1 @@ +"""Internal helpers for OpenHands instrumentation.""" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py new file mode 100644 index 000000000..6d99a6820 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py @@ -0,0 +1,12 @@ +"""Constant attribute keys & framework identity used across wrappers.""" + +from __future__ import annotations + +GEN_AI_FRAMEWORK = "gen_ai.framework" +GEN_AI_SPAN_KIND = "gen_ai.span.kind" + +FRAMEWORK_NAME = "openhands" + +# OpenHands-specific span attributes (namespaced to avoid clashing with the +# generic GenAI semconv attributes already provided by upstream). +OH_INITIAL_MESSAGE_PREVIEW = "openhands.initial_message.preview" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py new file mode 100644 index 000000000..534d3e611 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py @@ -0,0 +1,196 @@ +"""Cross-thread / cross-loop OTel context bridge keyed by OpenHands session id. + +Why this exists +--------------- + +OpenHands V0's ``EventStream`` delivers events to subscribers via a +``ThreadPoolExecutor``. The ``AgentController.on_event`` callback then runs + +.. code:: python + + asyncio.get_event_loop().run_until_complete(self._on_event(event)) + +inside a *worker thread*, which spins up a brand-new asyncio loop with a +fresh ``contextvars.Context``. This means none of the OTel context (tracer +spans / baggage) attached on the main coroutine in ``run_controller`` is +visible inside ``AgentController._step`` or ``Runtime.run_action`` — every +STEP / TOOL span starts at the **trace root**, fragmenting the trace into +many disconnected pieces. + +This module bridges that gap. We snapshot the OTel context at entry-time +(``run_controller`` / ``run_agent_until_done``) under the controller's +session id, and the STEP / TOOL wrappers re-attach the snapshot before +opening their spans so every span shares a single ``trace_id`` rooted at +the ENTRY span. + +The store is keyed by **session id (sid)** so concurrent benchmark +sessions stay isolated. +""" + +from __future__ import annotations + +import threading +from typing import Optional + +from opentelemetry import context as otel_context + +_lock = threading.Lock() +# Map session id -> OTel Context object. The Context contains the active +# Span (and any baggage / suppression flags). Re-attaching it makes the +# stored span the *current* span for whatever thread/loop attaches it. +_session_contexts: dict[str, otel_context.Context] = {} + +# Map session id -> { tool_name: tool_definition_dict }. Captured at +# AGENT span open from ``controller.agent.tools`` and consumed by the +# TOOL wrapper to populate ``gen_ai.tool.description`` and friends — the +# Runtime instance does not have direct access to the agent's tool list. +_session_tool_registry: dict[str, dict[str, dict]] = {} + +# Tracks the most-recent sid we stored a context for. Used as a fallback +# when a hook point (typically ``Runtime.run_action``) cannot locate the +# session id from its arguments — in single-session CLI runs this is +# always the right answer. +_last_sid: Optional[str] = None + + +def store_context(sid: Optional[str], ctx: otel_context.Context) -> None: + """Stash ``ctx`` under ``sid``. Updates ``_last_sid``.""" + if not sid: + return + global _last_sid + with _lock: + _session_contexts[sid] = ctx + _last_sid = sid + + +def get_context(sid: Optional[str]) -> Optional[otel_context.Context]: + """Return the stashed context for ``sid``, falling back to the last sid.""" + with _lock: + if sid and sid in _session_contexts: + return _session_contexts[sid] + if _last_sid and _last_sid in _session_contexts: + return _session_contexts[_last_sid] + return None + + +def clear_context(sid: Optional[str]) -> None: + if not sid: + return + global _last_sid + with _lock: + _session_contexts.pop(sid, None) + _session_tool_registry.pop(sid, None) + if _last_sid == sid: + _last_sid = None + + +def clear_all() -> None: + """Drop everything (only used by tests).""" + global _last_sid + with _lock: + _session_contexts.clear() + _session_tool_registry.clear() + _last_sid = None + + +# --------------------------------------------------------------------------- +# Tool registry (per-sid) +# --------------------------------------------------------------------------- + + +def store_tool_registry(sid: Optional[str], tools: object) -> None: + """Index ``tools`` by name and stash under ``sid``. + + ``tools`` is whatever ``controller.agent.tools`` exposes — typically a + list of LiteLLM ``ChatCompletionToolParam`` dicts of the form + ``{"type": "function", "function": {"name": ..., "description": ..., ...}}``. + Anything that doesn't fit that shape is best-effort skipped. + """ + if not sid or not tools: + return + registry: dict[str, dict] = {} + try: + for t in tools: # type: ignore[union-attr] + try: + if isinstance(t, dict): + fn = t.get("function") or {} + name = fn.get("name") if isinstance(fn, dict) else None + else: + fn = getattr(t, "function", None) + name = getattr(fn, "name", None) if fn is not None else None + # Normalize to a dict so the consumer doesn't need type-knowledge. + if name and not isinstance(t, dict): + t = { + "type": getattr(t, "type", "function"), + "function": { + "name": name, + "description": getattr(fn, "description", "") or "", + "parameters": getattr(fn, "parameters", None) or {}, + }, + } + if name: + registry[str(name)] = t + except Exception: + continue + except TypeError: + return + if not registry: + return + with _lock: + _session_tool_registry[sid] = registry + + +def get_tool_definition(sid: Optional[str], name: Optional[str]) -> Optional[dict]: + """Look up a single tool's definition (dict) by name, sid-scoped.""" + if not name: + return None + with _lock: + if sid and sid in _session_tool_registry: + return _session_tool_registry[sid].get(name) + # Fallback to the most-recent session — single-CLI-run case. + if _last_sid and _last_sid in _session_tool_registry: + return _session_tool_registry[_last_sid].get(name) + return None + + +def get_tool_registry(sid: Optional[str]) -> Optional[dict[str, dict]]: + """Return the full ``{name: definition}`` registry for ``sid``.""" + with _lock: + if sid and sid in _session_tool_registry: + return dict(_session_tool_registry[sid]) + if _last_sid and _last_sid in _session_tool_registry: + return dict(_session_tool_registry[_last_sid]) + return None + + +class AttachedSession: + """Context manager that attaches the stashed context for ``sid``. + + Usage:: + + with AttachedSession(sid): + span = tracer.start_span(...) + # span is parented under whatever the stashed context contains + + No-op when no stash exists for the given sid. + """ + + __slots__ = ("_sid", "_token") + + def __init__(self, sid: Optional[str]): + self._sid = sid + self._token = None + + def __enter__(self) -> "AttachedSession": + ctx = get_context(self._sid) + if ctx is not None: + self._token = otel_context.attach(ctx) + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + if self._token is not None: + try: + otel_context.detach(self._token) + except Exception: + pass + self._token = None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/utils.py new file mode 100644 index 000000000..7354bb8b2 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/utils.py @@ -0,0 +1,190 @@ +"""Small attribute / argument extraction helpers shared by the wrappers.""" + +from __future__ import annotations + +import json +from typing import Any + + +def safe_str(value: Any) -> str: + """Best-effort string conversion that never raises.""" + if value is None: + return "" + try: + return str(value) + except Exception: + return "" + + +def preview(text: Any, max_len: int | None = None) -> str: + """Return a string preview of *text* (kept for API compatibility). + + Truncation is no longer applied — captured content is emitted in + full so dashboards never lose information. ``max_len`` is accepted + but ignored. + """ + return safe_str(text) + + +def maybe_preview(text: Any) -> str: + """Alias for :func:`preview` — kept for API compatibility.""" + return preview(text) + + +def safe_get_attr(obj: Any, *names: str, default: Any = None) -> Any: + """Return the first non-None attribute among *names* on *obj*.""" + for name in names: + if obj is None: + return default + try: + v = getattr(obj, name, None) + except Exception: + v = None + if v is not None: + return v + return default + + +def serialize_message(message: Any) -> str: + """Best-effort serialize an OpenHands message-like object to text.""" + if message is None: + return "" + if isinstance(message, str): + return message + text_parts: list[str] = [] + for attr in ("text", "content", "value"): + v = safe_get_attr(message, attr) + if isinstance(v, str) and v: + return v + if isinstance(v, list): + for item in v: + t = safe_get_attr(item, "text", "content") + if isinstance(t, str) and t: + text_parts.append(t) + if text_parts: + return "\n".join(text_parts) + return safe_str(message) + + +def extract_uuid_str(value: Any) -> str: + """Convert a UUID-like value to its hex/string form, returning ''.""" + if value is None: + return "" + hex_attr = getattr(value, "hex", None) + if isinstance(hex_attr, str) and hex_attr: + return hex_attr + return safe_str(value) + + +# --------------------------------------------------------------------------- +# Semconv I/O serialization (input.value / output.value) +# --------------------------------------------------------------------------- + + +def _to_jsonable(obj: Any, depth: int = 0, max_depth: int = 3) -> Any: + """Best-effort convert ``obj`` into something json.dumps can serialize.""" + if obj is None or isinstance(obj, (bool, int, float, str)): + return obj + if depth >= max_depth: + return safe_str(obj) + if isinstance(obj, dict): + out: dict[str, Any] = {} + for k, v in obj.items(): + try: + out[safe_str(k)] = _to_jsonable(v, depth + 1, max_depth) + except Exception: + out[safe_str(k)] = safe_str(v) + return out + if isinstance(obj, (list, tuple, set)): + return [_to_jsonable(v, depth + 1, max_depth) for v in obj] + # Pydantic v2 + if hasattr(obj, "model_dump"): + try: + return _to_jsonable(obj.model_dump(), depth + 1, max_depth) + except Exception: + pass + # Dataclass / generic object + if hasattr(obj, "__dict__"): + try: + d = { + k: v + for k, v in vars(obj).items() + if not k.startswith("_") + and not callable(v) + } + if d: + return _to_jsonable(d, depth + 1, max_depth) + except Exception: + pass + return safe_str(obj) + + +def to_json_str(obj: Any, max_len: int | None = None) -> str: + """Convert ``obj`` to a JSON string. Empty string on failure. + + No truncation is applied — captured content is emitted in full. + ``max_len`` is accepted but ignored (kept for API compatibility). + """ + try: + jsonable = _to_jsonable(obj) + s = json.dumps(jsonable, ensure_ascii=False, default=safe_str) + except Exception: + s = safe_str(obj) + return s or "" + + +def maybe_to_json_str(obj: Any, max_len: int | None = None) -> str: + """Alias for :func:`to_json_str` — kept for API compatibility.""" + return to_json_str(obj, max_len) + + +def messages_to_genai_input(messages: Any) -> str: + """Serialize a chat-style ``messages`` list for ``gen_ai.input.messages``. + + Each item is normalized into ``{"role": ..., "content": ...}``. Keeps + ``tool_calls`` when present. + """ + if not isinstance(messages, list): + return "" + norm: list[dict[str, Any]] = [] + for m in messages: + role = safe_get_attr(m, "role") + content = safe_get_attr(m, "content") + if role is None and content is None and isinstance(m, dict): + role = m.get("role") + content = m.get("content") + if isinstance(content, list): + content = "".join( + safe_str(safe_get_attr(c, "text") or safe_get_attr(c, "content") or c) + for c in content + ) + item: dict[str, Any] = {"role": safe_str(role) or "user", "content": safe_str(content)} + tool_calls = safe_get_attr(m, "tool_calls") + if tool_calls: + item["tool_calls"] = _to_jsonable(tool_calls) + norm.append(item) + return to_json_str(norm) + + +def action_to_genai_output(action: Any) -> str: + """Serialize an OpenHands V0 ``Action`` into a GenAI-style assistant message.""" + if action is None: + return "" + action_type = safe_str(safe_get_attr(action, "action") or "") + thought = safe_str(safe_get_attr(action, "thought") or "") + item: dict[str, Any] = {"role": "assistant"} + if thought: + item["content"] = thought + args: dict[str, Any] = {} + for key in ("command", "code", "path", "url", "content", "task_list", "name", "arguments"): + v = safe_get_attr(action, key) + if v not in (None, "", []): + args[key] = _to_jsonable(v) + if action_type or args: + item["tool_calls"] = [ + { + "type": "function", + "function": {"name": action_type or "agent.action", "arguments": args}, + } + ] + return to_json_str([item]) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py new file mode 100644 index 000000000..614672ed5 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py @@ -0,0 +1,2535 @@ +"""Wrappers for the OpenHands **V0** (Legacy CodeAct) architecture. + +Trace tree +---------- + +:: + + ENTRY enter openhands (openhands.core.main.run_controller) + `-- AGENT invoke_agent codeact (openhands.core.loop.run_agent_until_done) + |-- STEP react step [×N] (openhands.controller.agent_controller.AgentController._step) + | `-- LLM chat {model} (litellm — covered by litellm instrumentor) + `-- TOOL execute_tool {tool_name} (openhands.runtime.base.Runtime.run_action) + +Context propagation across threads +---------------------------------- + +OpenHands V0's ``EventStream`` delivers events via ``ThreadPoolExecutor``, +and ``AgentController.on_event`` then runs the actual handler with a +*brand-new* asyncio loop in a worker thread: + +.. code:: python + + asyncio.get_event_loop().run_until_complete(self._on_event(event)) + +Python ``contextvars`` do NOT propagate from the main coroutine into these +worker threads, so ``AgentController._step`` and ``Runtime.run_action`` +would otherwise start *root* spans with fresh ``trace_id``s, fragmenting +the trace into many disconnected pieces. + +To fix that, we use :mod:`session_context` as a process-wide bridge: the +ENTRY wrapper stashes the OTel context (carrying the ENTRY+AGENT span +chain) keyed by session id, and STEP / TOOL wrappers re-attach it before +opening their span. The result is one trace per session id with the +correct parent-child links. + +I/O capture +----------- + +ENTRY / AGENT / STEP / TOOL spans all set: + +* ``input.value`` and ``output.value`` (OpenInference convention) +* ``input.mime_type`` / ``output.mime_type`` +* ``gen_ai.input.messages`` / ``gen_ai.output.messages`` where the GenAI + semconv applies (LLM-style messages + assistant tool calls) + +Capture is always on and content is emitted untruncated. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from opentelemetry import context as otel_context +from opentelemetry import trace as trace_api +from opentelemetry.semconv._incubating.attributes import ( + gen_ai_attributes as GenAI, +) +from opentelemetry.trace import ( + SpanKind, + Status, + StatusCode, + Tracer, + set_span_in_context, +) + +from opentelemetry.instrumentation.openhands.config import ( + OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS, +) +from opentelemetry.instrumentation.openhands.internal.constants import ( + FRAMEWORK_NAME, + GEN_AI_FRAMEWORK, + GEN_AI_SPAN_KIND, + OH_INITIAL_MESSAGE_PREVIEW, +) +from opentelemetry.instrumentation.openhands.internal.session_context import ( + AttachedSession, + clear_context, + get_context, + get_tool_definition, + store_context, + store_tool_registry, +) +from opentelemetry.instrumentation.openhands.internal.utils import ( + action_to_genai_output, + maybe_preview, + maybe_to_json_str, + messages_to_genai_input, + safe_get_attr, + safe_str, + serialize_message, + to_json_str, +) + +logger = logging.getLogger(__name__) + + +# Constants ----------------------------------------------------------------- + +OH_AGENT_NAME = "openhands.agent.name" +OH_REACT_ROUND = "gen_ai.react.round" +OH_AGENT_STATE = "openhands.agent.state" +OH_RUNTIME_NAME = "openhands.runtime.name" +OH_ACTION_TYPE = "openhands.action.type" +OH_OBSERVATION_TYPE = "openhands.observation.type" +OH_HISTORY_LENGTH = "openhands.history.length" + +# OpenInference / GenAI common I/O attribute keys +INPUT_VALUE = "input.value" +INPUT_MIME = "input.mime_type" +OUTPUT_VALUE = "output.value" +OUTPUT_MIME = "output.mime_type" +GEN_AI_INPUT_MESSAGES = "gen_ai.input.messages" +GEN_AI_OUTPUT_MESSAGES = "gen_ai.output.messages" +GEN_AI_SYSTEM = "gen_ai.system" +GEN_AI_AGENT_ID = "gen_ai.agent.id" +GEN_AI_CONVERSATION_ID = "gen_ai.conversation.id" +GEN_AI_SESSION_ID = "gen_ai.session.id" +GEN_AI_REQUEST_MODEL = "gen_ai.request.model" +GEN_AI_SYSTEM_INSTRUCTIONS = "gen_ai.system_instructions" +GEN_AI_SYSTEM_INSTRUCTION = "gen_ai.system_instruction" + +# Tool span attributes per ARMS GenAI semconv (gen-ai.md §Tool). +GEN_AI_TOOL_CALL_ID = "gen_ai.tool.call.id" +GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments" +GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result" +GEN_AI_TOOL_DESCRIPTION = "gen_ai.tool.description" +GEN_AI_TOOL_DEFINITIONS = "gen_ai.tool.definitions" + +# Stash slots on AgentController instances (set by AgentControllerInitWrapper). +_OWNS_FLAG = "_otel_oh_owns_lifecycle" +_ENTRY_SPAN_ATTR = "_otel_oh_entry_span" +_AGENT_SPAN_ATTR = "_otel_oh_agent_span" +_ENTRY_TOKEN_ATTR = "_otel_oh_entry_token" +_AGENT_TOKEN_ATTR = "_otel_oh_agent_token" +# STEP persistence — keeps the *most-recent* STEP span alive across the +# return of ``_step`` so that ``Runtime.run_action`` (which fires *later* +# in a thread-pool executor via ``call_sync_from_async``) can re-attach +# the STEP context and become its child rather than a sibling. +# +# IMPORTANT: we deliberately do **not** stash an OTel attach-token across +# the return of ``_step``. ``otel_context.attach()`` returns a Token that +# is bound to the ``contextvars.Context`` it was created in; calling +# ``detach(token)`` from a *different* context raises ``ValueError`` (and +# in production the Aliyun OTel SDK floods the log with +# "Token was created in a different Context" errors). Attach/detach +# always happen as a balanced pair *inside the same async task*; cross- +# task / cross-thread propagation goes through the ``Context`` *object* +# stashed in :mod:`session_context` and re-attached on the consumer side. +_STEP_SPAN_ATTR = "_otel_oh_step_span" +_AGENT_CTX_ATTR = "_otel_oh_agent_ctx" # restore target when STEP closes + + +def _set_common(span: trace_api.Span, kind: str) -> None: + span.set_attribute(GEN_AI_SPAN_KIND, kind) + span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + span.set_attribute(GEN_AI_SYSTEM, FRAMEWORK_NAME) + + +def _set_io( + span: trace_api.Span, + *, + input_value: str = "", + output_value: str = "", + input_messages: str = "", + output_messages: str = "", + mime: str = "application/json", +) -> None: + if input_value: + span.set_attribute(INPUT_VALUE, input_value) + span.set_attribute(INPUT_MIME, mime) + if output_value: + span.set_attribute(OUTPUT_VALUE, output_value) + span.set_attribute(OUTPUT_MIME, mime) + if input_messages: + span.set_attribute(GEN_AI_INPUT_MESSAGES, input_messages) + if output_messages: + span.set_attribute(GEN_AI_OUTPUT_MESSAGES, output_messages) + + +def _extract_model_from_config(config: Any) -> str: + if config is None: + return "" + try: + llms = safe_get_attr(config, "llms") + if isinstance(llms, dict) and llms: + llm = next(iter(llms.values())) + model = safe_get_attr(llm, "model") + if model: + return safe_str(model) + except Exception: + pass + try: + llm = safe_get_attr(config, "llm") + model = safe_get_attr(llm, "model") + if model: + return safe_str(model) + except Exception: + pass + return "" + + +def _extract_input_message_text(initial_user_action: Any) -> str: + """Pull human-readable text out of an ``initial_user_action`` argument.""" + return serialize_message(initial_user_action) + + +def _state_to_input_messages(state: Any, max_messages: int = 10) -> str: + """Best-effort extract a chat-style messages list from a controller State. + + The actual messages sent to the LLM are built inside ``CodeActAgent.step`` + and not stored on the controller, so this is a coarse summary derived + from ``state.history`` which is reliably available. + """ + history = safe_get_attr(state, "history") or [] + if not isinstance(history, list): + return "" + items: list[dict[str, str]] = [] + # Keep the most recent ``max_messages`` events for size budget. + for ev in history[-max_messages:]: + cls_name = type(ev).__name__ + # Map common event types to roles + if cls_name in ("MessageAction", "SystemMessageAction"): + role = "user" if str(safe_get_attr(ev, "source")) == "user" else "assistant" + content = safe_get_attr(ev, "content") or safe_get_attr(ev, "message") or "" + elif cls_name.endswith("Action"): + role = "assistant" + content = ( + safe_get_attr(ev, "thought") + or safe_get_attr(ev, "command") + or safe_get_attr(ev, "code") + or safe_str(ev) + ) + elif cls_name.endswith("Observation"): + role = "tool" + content = safe_get_attr(ev, "content") or safe_str(ev) + else: + role = "system" + content = safe_str(ev) + items.append({"role": role, "content": safe_str(content), "event": cls_name}) + return to_json_str(items) + + +def _final_state_to_output(state: Any) -> str: + """Serialize the controller's final state for output.value.""" + if state is None: + return "" + payload: dict[str, Any] = {} + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + payload["agent_state"] = ( + safe_get_attr(agent_state, "value") or safe_str(agent_state) + ) + last_error = safe_get_attr(state, "last_error") + if last_error: + payload["last_error"] = safe_str(last_error) + iteration = safe_get_attr(state, "iteration") + if iteration is not None: + payload["iteration"] = safe_str(iteration) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list) and history: + payload["history_length"] = len(history) + # Find the last AgentFinishAction or last assistant content for a final answer summary. + for ev in reversed(history): + if type(ev).__name__ == "AgentFinishAction": + payload["final_thought"] = safe_str( + safe_get_attr(ev, "final_thought") + or safe_get_attr(ev, "thought") + or "" + ) + payload["outputs"] = safe_str(safe_get_attr(ev, "outputs") or {}) + break + return to_json_str(payload) + + +def _entry_input_messages_from_initial(initial_user_action: Any) -> str: + """Return ARMS gen_ai.input.messages for the ENTRY span.""" + text = _extract_input_message_text(initial_user_action) + if not text: + return "" + return to_json_str( + [{"role": "user", "parts": [{"type": "text", "content": text}]}] + ) + + +def _entry_io_from_state(state: Any) -> tuple[str, str]: + """Return (input_messages, output_messages) for ENTRY from final state.""" + history = safe_get_attr(state, "history") or [] + input_messages = "" + output_messages = "" + if isinstance(history, list) and history: + input_payload = _history_to_input_messages_schema(history) + if input_payload: + input_messages = to_json_str(input_payload) + output_payload = _history_to_output_messages_schema(history) + if output_payload: + output_messages = to_json_str(output_payload) + if not output_messages: + final_state = _final_state_to_output(state) + if final_state: + output_messages = to_json_str( + [ + { + "role": "assistant", + "parts": [{"type": "text", "content": final_state}], + "finish_reason": "stop", + } + ] + ) + return input_messages, output_messages + + +# --------------------------------------------------------------------------- +# ARMS GenAI semconv message-schema converters. +# +# Per gen-ai.md §LLM/§Agent, gen_ai.input.messages / gen_ai.output.messages +# / gen_ai.system_instructions follow a "parts"-based structure: +# +# [{"role": "user|assistant|tool|system", +# "parts": [{"type": "text|tool_call|tool_call_response|...", +# "content": "...", "name": "...", "id": "...", +# "arguments": {...}, "result": "..."}], +# "finish_reason": "stop|...", # output only +# }] +# +# The system instructions schema is a flat list of parts: +# +# [{"type": "text", "content": "..."}] +# --------------------------------------------------------------------------- + + +def _action_event_to_parts(ev: Any) -> list[dict[str, Any]]: + """Convert an Action event into a list of ``parts`` for AGENT messages. + + Captures both the model's "thought" text and any ``tool_call`` part + derived from ``tool_call_metadata``. + """ + parts: list[dict[str, Any]] = [] + thought = safe_get_attr(ev, "thought") + if thought: + parts.append({"type": "text", "content": safe_str(thought)}) + tcm = safe_get_attr(ev, "tool_call_metadata") + if tcm is not None: + fn_name = safe_str(safe_get_attr(tcm, "function_name") or "") + tcid = safe_str(safe_get_attr(tcm, "tool_call_id") or "") + # Best-effort harvest the original LLM-emitted JSON arguments. + args: Any = {} + try: + mr = safe_get_attr(tcm, "model_response") + choices = ( + getattr(mr, "choices", None) + if mr is not None + else None + ) or [] + for choice in choices: + msg = getattr(choice, "message", None) or ( + choice.get("message") if isinstance(choice, dict) else None + ) + tool_calls = ( + getattr(msg, "tool_calls", None) + if msg is not None + else None + ) or (msg.get("tool_calls") if isinstance(msg, dict) else None) + if not tool_calls: + continue + for tc in tool_calls: + tc_id = ( + getattr(tc, "id", None) + if not isinstance(tc, dict) + else tc.get("id") + ) + if tcid and safe_str(tc_id) != tcid: + continue + fn = ( + getattr(tc, "function", None) + if not isinstance(tc, dict) + else tc.get("function") + ) + raw = ( + getattr(fn, "arguments", None) + if not isinstance(fn, dict) + else fn.get("arguments") + ) + if isinstance(raw, str): + try: + import json as _json + + args = _json.loads(raw) + except Exception: + args = {"raw": raw} + elif isinstance(raw, dict): + args = raw + except Exception: + args = {} + if not args: + for key in ( + "command", + "code", + "path", + "url", + "content", + "task_list", + "old_str", + "new_str", + "file_text", + ): + v = safe_get_attr(ev, key) + if v not in (None, "", [], {}): + args[key] = v + if fn_name or tcid or args: + parts.append( + { + "type": "tool_call", + "id": tcid, + "name": fn_name or safe_str(safe_get_attr(ev, "action") or ""), + "arguments": args, + } + ) + if not parts: + # Minimal fallback when nothing else could be extracted. + action_type = safe_str(safe_get_attr(ev, "action") or "") + if action_type: + parts.append({"type": "tool_call", "name": action_type, "arguments": {}}) + return parts + + +def _observation_event_to_parts(ev: Any) -> list[dict[str, Any]]: + """Convert an Observation event into ``parts`` for tool-response messages.""" + tcm = safe_get_attr(ev, "tool_call_metadata") + tcid = safe_str(safe_get_attr(tcm, "tool_call_id") or "") if tcm else "" + result_payload: dict[str, Any] = {} + for key in ("content", "exit_code", "error", "stdout", "stderr", "url"): + v = safe_get_attr(ev, key) + if v not in (None, "", [], {}): + result_payload[key] = v + return [ + { + "type": "tool_call_response", + "id": tcid, + "result": result_payload or safe_str(ev), + } + ] + + +def _history_to_input_messages_schema(history: list, max_events: int = 200) -> list[dict[str, Any]]: + """Convert ``state.history`` into the ARMS gen_ai.input.messages schema. + + Folds adjacent same-role events into a single message with multiple + ``parts``, mirroring how the messages were assembled when sent to + the LLM. + """ + if not history: + return [] + items = history[-max_events:] + messages: list[dict[str, Any]] = [] + for ev in items: + cls = type(ev).__name__ + # Determine role + parts for this event. + if cls == "SystemMessageAction": + # System is reported separately under gen_ai.system_instructions. + continue + if cls == "MessageAction": + src = str(safe_get_attr(ev, "source") or "").lower() + role = "user" if src == "user" else "assistant" + content = safe_str(safe_get_attr(ev, "content") or "") + parts = [{"type": "text", "content": content}] + elif cls.endswith("Observation"): + role = "tool" + parts = _observation_event_to_parts(ev) + elif cls.endswith("Action"): + role = "assistant" + parts = _action_event_to_parts(ev) + else: + role = "system" + parts = [{"type": "text", "content": safe_str(ev)}] + # Fold consecutive same-role messages. + if messages and messages[-1]["role"] == role: + messages[-1]["parts"].extend(parts) + else: + messages.append({"role": role, "parts": parts}) + return messages + + +def _history_to_output_messages_schema(history: list) -> list[dict[str, Any]]: + """Pull the *final* assistant turn from history per ARMS gen_ai.output.messages. + + Walks back from the end of history and collects assistant-side events + (Actions) up to the previous user/tool boundary. Includes a + ``finish_reason`` derived from the last AgentFinishAction / state. + """ + if not history: + return [] + finish_reason = "stop" + tail_actions: list[Any] = [] + for ev in reversed(history): + cls = type(ev).__name__ + if cls == "AgentFinishAction": + finish_reason = safe_str( + safe_get_attr(ev, "final_thought") and "stop" or "stop" + ) + tail_actions.insert(0, ev) + continue + if cls.endswith("Observation") or cls == "MessageAction": + # Stop once we cross back into user-input or tool-result territory. + if cls == "MessageAction" and str( + safe_get_attr(ev, "source") or "" + ).lower() == "user": + break + if cls.endswith("Observation"): + break + if cls.endswith("Action") or ( + cls == "MessageAction" + and str(safe_get_attr(ev, "source") or "").lower() != "user" + ): + tail_actions.insert(0, ev) + if not tail_actions: + # Fallback: at least include the very last event as the assistant turn. + tail_actions = [history[-1]] + parts: list[dict[str, Any]] = [] + for ev in tail_actions: + cls = type(ev).__name__ + if cls == "MessageAction": + content = safe_str(safe_get_attr(ev, "content") or "") + if content: + parts.append({"type": "text", "content": content}) + elif cls == "AgentFinishAction": + ft = safe_str(safe_get_attr(ev, "final_thought") or "") + if ft: + parts.append({"type": "text", "content": ft}) + outputs = safe_get_attr(ev, "outputs") + if outputs: + parts.append({"type": "text", "content": safe_str(outputs)}) + else: + parts.extend(_action_event_to_parts(ev)) + if not parts: + parts = [{"type": "text", "content": ""}] + return [{"role": "assistant", "parts": parts, "finish_reason": finish_reason}] + + +def _agent_to_system_instructions(agent: Any, state: Any) -> list[dict[str, Any]]: + """Return ARMS gen_ai.system_instructions for the controller's agent. + + Tries the explicit ``agent.get_system_message()`` API first (most + accurate), then falls back to scanning ``state.history`` for a + ``SystemMessageAction``. + """ + content = "" + try: + gsm = safe_get_attr(agent, "get_system_message") + if callable(gsm): + sm = gsm() + content = safe_str(safe_get_attr(sm, "content") or "") + except Exception: + content = "" + if not content: + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + for ev in history: + if type(ev).__name__ == "SystemMessageAction": + content = safe_str(safe_get_attr(ev, "content") or "") + if content: + break + if not content: + return [] + return [{"type": "text", "content": content}] + + +# --------------------------------------------------------------------------- +# ENTRY: openhands.core.main.run_controller +# --------------------------------------------------------------------------- + + +class RunControllerWrapper: + """ENTRY span around the V0 CLI/headless ``run_controller`` coroutine. + + Stashes the active OTel Context (with the ENTRY span attached) keyed + by ``sid`` so STEP / TOOL spans firing in worker threads can re-attach + it and remain in the same trace. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + async def _impl(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return await wrapped(*args, **kwargs) + + config = kwargs.get("config") + if config is None and args: + config = args[0] + initial_user_action = kwargs.get("initial_user_action") + if initial_user_action is None and len(args) >= 2: + initial_user_action = args[1] + sid = kwargs.get("sid") + if sid is None and len(args) >= 3: + sid = args[2] + # When sid wasn't passed, we don't yet know the auto-generated one; + # the controller will publish ``controller.id`` later. We update + # the stash again from inside the AGENT wrapper. + + span = self._tracer.start_span("enter openhands", kind=SpanKind.INTERNAL) + _set_common(span, "ENTRY") + span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter") + if sid: + span.set_attribute(GEN_AI_SESSION_ID, safe_str(sid)) + span.set_attribute(GEN_AI_CONVERSATION_ID, safe_str(sid)) + model = _extract_model_from_config(config) + if model: + span.set_attribute(GEN_AI_REQUEST_MODEL, model) + + input_text = _extract_input_message_text(initial_user_action) + preview = maybe_preview(input_text) + if preview: + span.set_attribute(OH_INITIAL_MESSAGE_PREVIEW, preview) + captured_input = ( + maybe_to_json_str({"role": "user", "content": input_text}) + if input_text + else "" + ) + if captured_input: + entry_input_messages = _entry_input_messages_from_initial( + initial_user_action + ) + _set_io( + span, + input_value=captured_input, + input_messages=entry_input_messages, + ) + + ctx = set_span_in_context(span) + token = otel_context.attach(ctx) + if sid: + store_context(sid, ctx) + try: + try: + result = await wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR, type(exc).__qualname__)) + raise + try: + final_state_repr = _final_state_to_output(result) + entry_input_messages, entry_output_messages = _entry_io_from_state( + result + ) + if final_state_repr: + _set_io( + span, + output_value=final_state_repr, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + agent_state = safe_get_attr(result, "agent_state") + if agent_state is not None: + span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") or safe_str(agent_state), + ) + elif entry_input_messages or entry_output_messages: + _set_io( + span, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + except Exception: + pass + return result + finally: + try: + otel_context.detach(token) + except Exception: + pass + if sid: + clear_context(sid) + span.end() + + +# --------------------------------------------------------------------------- +# AGENT: openhands.core.loop.run_agent_until_done +# --------------------------------------------------------------------------- + + +class RunAgentUntilDoneWrapper: + """AGENT span around the V0 polling loop. + + Re-attaches the ENTRY context (in case asyncio task creation didn't + propagate it for some reason) and re-stashes a fresh context that now + also includes the AGENT span — that's what STEP / TOOL re-attach. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + async def _impl(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return await wrapped(*args, **kwargs) + + controller = kwargs.get("controller") + if controller is None and args: + controller = args[0] + agent = safe_get_attr(controller, "agent") + agent_name = safe_get_attr(agent, "name") or "codeact" + agent_class = ( + f"{type(agent).__module__}.{type(agent).__name__}" if agent else "" + ) + sid = safe_str(safe_get_attr(controller, "id") or "") + llm = safe_get_attr(agent, "llm") + llm_config = safe_get_attr(llm, "config") + model = safe_get_attr(llm_config, "model") or safe_get_attr(llm, "model") + + # If AgentController.__init__ already opened lifecycle-bound ENTRY+AGENT + # spans, do not create a second AGENT here. Just run the loop with the + # existing AGENT context current so STEP/LLM/TOOL remain descendants. + lifecycle_agent_span = getattr(controller, _AGENT_SPAN_ATTR, None) + lifecycle_agent_ctx = getattr(controller, _AGENT_CTX_ATTR, None) + if lifecycle_agent_span is not None and lifecycle_agent_ctx is not None: + try: + _capture_agent_io_attributes( + lifecycle_agent_span, + controller, + agent, + safe_get_attr(controller, "state"), + ) + except Exception: + pass + lifecycle_token = otel_context.attach(lifecycle_agent_ctx) + try: + return await wrapped(*args, **kwargs) + except BaseException as exc: + try: + lifecycle_agent_span.record_exception(exc) + lifecycle_agent_span.set_status( + Status(StatusCode.ERROR, type(exc).__qualname__) + ) + except Exception: + pass + raise + finally: + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes( + lifecycle_agent_span, controller, agent, state + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + lifecycle_agent_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + except Exception: + pass + try: + otel_context.detach(lifecycle_token) + except Exception: + pass + + # Bridge: re-attach whatever the ENTRY wrapper stashed (works even + # if asyncio.create_task somehow lost the context, and is the only + # way for the worker-thread STEP / TOOL spans to find us). + attach_ctx = get_context(sid) + fallback_entry_span: trace_api.Span | None = None + if attach_ctx is None: + fallback_entry_span = self._tracer.start_span( + "enter openhands", kind=SpanKind.INTERNAL + ) + _set_common(fallback_entry_span, "ENTRY") + fallback_entry_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter") + if sid: + fallback_entry_span.set_attribute(GEN_AI_SESSION_ID, sid) + fallback_entry_span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + if agent_class: + fallback_entry_span.set_attribute(OH_AGENT_NAME, agent_class) + if model: + fallback_entry_span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + try: + state = safe_get_attr(controller, "state") + entry_input_messages, _ = _entry_io_from_state(state) + if entry_input_messages: + _set_io( + fallback_entry_span, + input_value=entry_input_messages, + input_messages=entry_input_messages, + ) + except Exception: + pass + attach_ctx = set_span_in_context(fallback_entry_span) + if sid: + store_context(sid, attach_ctx) + if attach_ctx is not None: + attach_token = otel_context.attach(attach_ctx) + else: + attach_token = None + + try: + span = self._tracer.start_span( + f"invoke_agent {agent_name}", + kind=SpanKind.INTERNAL, + context=attach_ctx, + ) + _set_common(span, "AGENT") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)) + if agent_class: + span.set_attribute(OH_AGENT_NAME, agent_class) + if sid: + span.set_attribute(GEN_AI_SESSION_ID, sid) + span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + span.set_attribute(GEN_AI_AGENT_ID, sid) + if model: + span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + + # Capture the agent's tool registry so the TOOL wrapper (which + # only sees a Runtime instance) can resolve tool descriptions + # and produce ``gen_ai.tool.description``. Also emit + # ``gen_ai.tool.definitions`` on this AGENT span itself per the + # ARMS GenAI semconv §Agent — minimal {type,name} entries by + # default; full definitions only when content capture is on. + try: + tools = safe_get_attr(agent, "tools") or [] + if sid: + store_tool_registry(sid, tools) + tool_defs_summary: list[dict[str, Any]] = [] + for t in tools: + if isinstance(t, dict): + kind = t.get("type") or "function" + fn = t.get("function") or {} + name = fn.get("name") if isinstance(fn, dict) else None + else: + kind = safe_get_attr(t, "type") or "function" + fn = safe_get_attr(t, "function") + name = safe_get_attr(fn, "name") + if not name: + continue + item: dict[str, Any] = {"type": safe_str(kind), "name": safe_str(name)} + if isinstance(fn, dict): + desc = fn.get("description") + params = fn.get("parameters") + else: + desc = safe_get_attr(fn, "description") + params = safe_get_attr(fn, "parameters") + if desc: + item["description"] = safe_str(desc) + if params: + item["parameters"] = params + tool_defs_summary.append(item) + if tool_defs_summary: + span.set_attribute( + GEN_AI_TOOL_DEFINITIONS, to_json_str(tool_defs_summary) + ) + except Exception: + pass + + # Capture initial user/system context for AGENT using the same + # ARMS message schema as the lifecycle-bound AGENT path. + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes(span, controller, agent, state) + except Exception: + pass + + # Stash the context that now includes the AGENT span so STEP / + # TOOL re-attach correctly even when running in worker threads. + ctx_with_agent = set_span_in_context(span) + if sid: + store_context(sid, ctx_with_agent) + # Mirror onto the controller too — STEP wrapper uses this when + # closing a STEP to restore the session stash to AGENT instead + # of leaving a dangling closed-STEP context behind. + if controller is not None: + try: + setattr(controller, _AGENT_CTX_ATTR, ctx_with_agent) + setattr(controller, _AGENT_SPAN_ATTR, span) + except Exception: + pass + if getattr(controller, _STEP_SPAN_ATTR, None) is None: + try: + warmup_step = self._tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + context=ctx_with_agent, + ) + _set_common(warmup_step, "STEP") + warmup_step.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react") + warmup_step.set_attribute(OH_REACT_ROUND, 1) + warmup_step.set_attribute( + GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name) + ) + if sid: + warmup_step.set_attribute(GEN_AI_SESSION_ID, sid) + warmup_step.set_attribute(GEN_AI_CONVERSATION_ID, sid) + warmup_step.set_attribute(GEN_AI_AGENT_ID, sid) + setattr(controller, _STEP_SPAN_ATTR, warmup_step) + setattr(controller, "_otel_oh_round", 1) + setattr(controller, "_otel_oh_step_consumed", False) + if sid: + store_context(sid, set_span_in_context(warmup_step)) + except Exception: + pass + agent_token = otel_context.attach(ctx_with_agent) + try: + try: + result = await wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status( + Status(StatusCode.ERROR, type(exc).__qualname__) + ) + raise + # Capture final AGENT I/O using ARMS gen_ai.* message attrs. + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes(span, controller, agent, state) + output_repr = _final_state_to_output(state) + if output_repr: + _set_io(span, output_value=output_repr) + if state is not None: + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") + or safe_str(agent_state), + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + span.set_attribute(OH_HISTORY_LENGTH, len(history)) + except Exception: + pass + return result + finally: + try: + otel_context.detach(agent_token) + except Exception: + pass + if controller is not None: + try: + if getattr(controller, _AGENT_SPAN_ATTR, None) is span: + setattr(controller, _AGENT_SPAN_ATTR, None) + except Exception: + pass + try: + _close_open_step(controller) + except Exception: + pass + span.end() + finally: + if attach_token is not None: + try: + otel_context.detach(attach_token) + except Exception: + pass + if fallback_entry_span is not None: + try: + state = safe_get_attr(controller, "state") + output_repr = _final_state_to_output(state) + entry_input_messages, entry_output_messages = _entry_io_from_state( + state + ) + if output_repr: + _set_io( + fallback_entry_span, + output_value=output_repr, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + elif entry_input_messages or entry_output_messages: + _set_io( + fallback_entry_span, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + fallback_entry_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + except Exception: + pass + try: + fallback_entry_span.end() + except Exception: + pass + if sid: + try: + clear_context(sid) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# STEP: AgentController._step +# --------------------------------------------------------------------------- + + +def _close_open_step(controller: Any) -> None: + """End the controller's currently-open STEP span, if any. + + Restores the session-context stash to the controller's AGENT context + (kept under ``_AGENT_CTX_ATTR``) so subsequent TOOL spans are still + parented correctly even after the last STEP closes. + + Crucially, this function only ends the *span* — it never touches an + attach-token. The STEP wrapper attaches/detaches the STEP context + in a balanced pair *inside* the ``_step`` coroutine; cross-task + propagation happens via the ``Context`` object stashed in + :mod:`session_context`, which can be re-attached safely from any + task / thread because every attach is paired with a detach inside + its creating context. + """ + span = getattr(controller, _STEP_SPAN_ATTR, None) + if span is None: + return + try: + span.end() + except Exception: + pass + try: + setattr(controller, _STEP_SPAN_ATTR, None) + except Exception: + pass + sid = safe_str(safe_get_attr(controller, "id") or "") + agent_ctx = getattr(controller, _AGENT_CTX_ATTR, None) + if sid and agent_ctx is not None: + store_context(sid, agent_ctx) + + +class AgentControllerStepWrapper: + """STEP span around one ReAct iteration of the V0 controller. + + The STEP span is intentionally **kept open across the return of + ``_step``**. Why: ``Runtime.run_action`` runs *later*, in a thread-pool + executor (``call_sync_from_async`` inside ``_handle_action``), so by + the time TOOL fires the STEP coroutine has already returned. Closing + STEP at end of ``_step`` would make every TOOL a sibling of STEP + (parented under AGENT) instead of a child. + + Lifecycle: + + 1. New ``_step`` invoked → close *previous* STEP if any → open new + STEP (child of AGENT) → stash STEP context under ``sid`` so that + TOOL / LLM spans firing on worker threads re-attach STEP. + 2. ``_step`` body runs to completion. We do **not** close STEP here. + 3. The next ``_step`` (or ``AgentController.close``) closes the + still-open STEP. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + @staticmethod + def _will_step_be_noop(instance: Any) -> bool: + """Return True if this ``_step`` call will short-circuit without + producing real work (state != RUNNING, or a pending action is + already queued). We skip span emission for these so the round + counter stays sequential (1, 2, 3, ...) instead of inflating to + (1, 3, 5, ...) with empty 0.5ms STEP spans cluttering the trace. + + This mirrors the early-return checks at the top of + ``AgentController._step`` (state-check + ``_pending_action``). + We read ``_pending_action_info`` directly rather than going + through the ``_pending_action`` *property* — the property has + logging side effects (it can emit a "pending action active for + Xs" log line at warn-level) that we don't want to trigger from + an instrumentation hot path. + """ + try: + state = safe_get_attr(instance, "state") + agent_state = safe_get_attr(state, "agent_state") + # AgentState enum value is 'running' (case-insensitive). + agent_state_str = ( + safe_str(safe_get_attr(agent_state, "value") or agent_state).lower() + ) + if agent_state_str != "running": + return True + # Check the underlying tuple slot, not the property — the + # property's getter is non-trivial in OpenHands. + if getattr(instance, "_pending_action_info", None) is not None: + return True + except Exception: + return False + return False + + @staticmethod + def _snapshot_for_work_detection(instance: Any) -> tuple[int, Any]: + """Snapshot the bits we need to tell whether ``_step`` body did + anything. Returned tuple is (history_length, pending_action_id). + Used by ``_impl`` to detect "empty" STEP invocations that get + through ``_will_step_be_noop`` (e.g. ``state_tracker`` raised, + ``_is_stuck`` early-returned, ``agent.step`` returned ``None``) + and shouldn't show up in the trace as 0.3ms placeholder spans. + """ + try: + state = safe_get_attr(instance, "state") + history = safe_get_attr(state, "history") + history_len = len(history) if isinstance(history, list) else 0 + except Exception: + history_len = 0 + try: + info = getattr(instance, "_pending_action_info", None) + pending_id = id(info) if info is not None else None + except Exception: + pending_id = None + return history_len, pending_id + + async def _impl(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return await wrapped(*args, **kwargs) + + # Skip no-op _step invocations entirely so the trace shows only + # the rounds that actually do work (LLM call + tool dispatch). + if self._will_step_be_noop(instance): + return await wrapped(*args, **kwargs) + + sid = safe_str(safe_get_attr(instance, "id") or "") + agent = safe_get_attr(instance, "agent") + agent_name = safe_get_attr(agent, "name") or "codeact" + + # Snapshot the AGENT context if we don't already have one so + # ``_close_open_step`` can restore the session stash to AGENT + # after STEP ends. + if not hasattr(instance, _AGENT_CTX_ATTR) or getattr(instance, _AGENT_CTX_ATTR, None) is None: + try: + setattr(instance, _AGENT_CTX_ATTR, get_context(sid)) + except Exception: + pass + + # ----- Reuse warmup STEP if not yet consumed ----- + # The init wrapper opens a warmup STEP (round 1) so pre-step + # actions like RECALL parent under STEP 1. The first real + # ``_step`` reuses that STEP (without bumping the round) so the + # LLM call + first LLM-driven tool also nest under STEP 1. From + # the second real ``_step`` onward, we close the previous STEP + # and open a new one with round = previous + 1. + existing_step = getattr(instance, _STEP_SPAN_ATTR, None) + consumed = bool(getattr(instance, "_otel_oh_step_consumed", True)) + reused_warmup = False + is_new_span = False + if existing_step is not None and not consumed: + span = existing_step + round_num = int(getattr(instance, "_otel_oh_round", 1) or 1) + reused_warmup = True + try: + setattr(instance, "_otel_oh_step_consumed", True) + except Exception: + pass + else: + # Close any still-open consumed STEP from the previous round + # before opening a new one. + _close_open_step(instance) + # Tentative round number — only committed if body does work. + round_num = int(getattr(instance, "_otel_oh_round", 0) or 0) + 1 + + # Open the new STEP as a child of AGENT. Prefer the explicit + # AGENT context (more reliable than relying on contextvars + # propagation across asyncio task / thread boundaries). + agent_ctx = getattr(instance, _AGENT_CTX_ATTR, None) + if agent_ctx is None and sid: + agent_ctx = get_context(sid) + try: + span = self._tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + context=agent_ctx, + ) + except Exception: + # Fall back to current-context-based parenting if explicit + # context= isn't accepted (older OTel SDKs). + with AttachedSession(sid): + span = self._tracer.start_span( + "react step", kind=SpanKind.INTERNAL + ) + _set_common(span, "STEP") + span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react") + span.set_attribute(OH_REACT_ROUND, round_num) + span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)) + if sid: + span.set_attribute(GEN_AI_SESSION_ID, sid) + span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + span.set_attribute(GEN_AI_AGENT_ID, sid) + is_new_span = True + try: + setattr(instance, _STEP_SPAN_ATTR, span) + setattr(instance, "_otel_oh_step_consumed", True) + except Exception: + try: + span.end() + except Exception: + pass + return await wrapped(*args, **kwargs) + + # Capture INPUT: messages going into this step. + try: + state = safe_get_attr(instance, "state") + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + span.set_attribute(OH_HISTORY_LENGTH, len(history)) + input_messages = _state_to_input_messages(state) + if input_messages: + _set_io( + span, + input_value=input_messages, + input_messages=input_messages, + ) + except Exception: + pass + + # Build the STEP context object. Cross-thread propagation goes + # through this Context object stashed in session_context (TOOL / + # LLM wrappers re-attach it inside their own scopes with paired + # attach/detach so no token ever crosses a context boundary). + step_ctx = set_span_in_context(span) + if sid: + store_context(sid, step_ctx) + + # Snapshot pre-body state so we can detect "empty" body that + # got through ``_will_step_be_noop`` (e.g. ``state_tracker`` + # raised inside ``_step``, ``_is_stuck`` early-returned, or + # ``agent.step`` returned ``None`` / raised handled error). + pre_history_len, pre_pending_id = self._snapshot_for_work_detection( + instance + ) + + # Attach STEP for the *body's* contextvars propagation only. + # Both attach and the matching detach happen in this coroutine's + # own context, so the Aliyun SDK's strict token check is happy. + step_token = otel_context.attach(step_ctx) + body_error: BaseException | None = None + try: + result = await wrapped(*args, **kwargs) + except BaseException as exc: + body_error = exc + finally: + try: + otel_context.detach(step_token) + except Exception: + pass + + if body_error is not None: + try: + span.set_attribute( + "gen_ai.react.finish_reason", type(body_error).__qualname__ + ) + span.record_exception(body_error) + span.set_status( + Status(StatusCode.ERROR, type(body_error).__qualname__) + ) + except Exception: + pass + # On error, close STEP now so the failure surfaces cleanly + # rather than waiting for the next _step / controller close. + _close_open_step(instance) + # Make sure the round counter we *tentatively* assigned for + # this STEP gets committed so subsequent rounds renumber + # past it instead of overlapping. + if is_new_span: + try: + instance._otel_oh_round = round_num + except Exception: + pass + raise body_error + + # Detect post-body "empty" STEP — the wrapper passed the + # ``_will_step_be_noop`` pre-check but the body still produced + # zero observable work (no new history events, no new pending + # action). The user has explicitly asked us not to clutter the + # trace with sub-millisecond placeholder STEP spans, so: + # + # * If we *opened* a fresh span this round, end it immediately, + # mark it ``openhands.step.empty=true``, and DO NOT bump the + # committed round counter. Next real _step opens a fresh STEP + # with the same round number — the empty span still appears + # in the trace (we have no way to suppress export from inside + # a wrapper), but with a clear ``empty=true`` marker so it's + # trivially filterable in the dashboard. + # * If we *reused* a warmup / persisted STEP that was already + # meaningful (had earlier RECALL/TOOL children), keep it open + # and don't mark it empty — the children give it value. + post_history_len, post_pending_id = self._snapshot_for_work_detection( + instance + ) + did_work = ( + post_history_len > pre_history_len + or (post_pending_id is not None and post_pending_id != pre_pending_id) + ) + + if not did_work and is_new_span: + try: + span.set_attribute("openhands.step.empty", True) + span.set_attribute( + "gen_ai.react.finish_reason", "noop_step_body" + ) + span.end() + except Exception: + pass + # Forget this empty STEP so the next _step opens a fresh one + # without trying to close-or-reuse this one. + try: + if getattr(instance, _STEP_SPAN_ATTR, None) is span: + setattr(instance, _STEP_SPAN_ATTR, None) + except Exception: + pass + try: + # Roll back to the previous committed round (don't + # advance the counter for an empty STEP). + instance._otel_oh_round = round_num - 1 + instance._otel_oh_step_consumed = True + except Exception: + pass + # Restore session stash to AGENT so subsequent TOOLs land + # under AGENT (not under a now-ended STEP). + if sid: + agent_ctx = getattr(instance, _AGENT_CTX_ATTR, None) + if agent_ctx is not None: + try: + store_context(sid, agent_ctx) + except Exception: + pass + return result + + # Body did work — commit the round counter (we only update it + # *after* we're sure the STEP is meaningful). + if is_new_span: + try: + instance._otel_oh_round = round_num + except Exception: + pass + + # Capture OUTPUT: the freshly-decided pending action. + try: + pending = getattr(instance, "_pending_action", None) + state = safe_get_attr(instance, "state") + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") + or safe_str(agent_state), + ) + if pending is not None: + action_type = _action_type_value(pending) + if action_type: + span.set_attribute(OH_ACTION_TYPE, action_type) + out = action_to_genai_output(pending) + if out: + _set_io(span, output_value=out, output_messages=out) + except Exception: + pass + + # Mirror the latest history snapshot back up to the AGENT span + # so AGENT's input.value / gen_ai.input.messages stay current + # *during* the run (not just at close-time). The user wants to + # see the conversation accumulate on AGENT live, since the + # downstream dashboards may read AGENT before the controller + # actually closes. + try: + agent_span = getattr(instance, _AGENT_SPAN_ATTR, None) + if agent_span is not None: + _capture_agent_io_attributes( + agent_span, instance, agent, safe_get_attr(instance, "state") + ) + except Exception: + pass + + # Mark the warmup STEP (round 1) the moment we know it carries + # real work — it now contains LLM/TOOL children and matters. + if reused_warmup: + try: + span.set_attribute("openhands.step.warmup_consumed", True) + except Exception: + pass + + # STEP span stays open here — it lives until the next _step (or + # AgentController.close) ends it. Until then any TOOL fired by + # Runtime.run_action on a thread-pool worker will re-attach the + # STEP context object stashed above and become its child. + return result + + +# --------------------------------------------------------------------------- +# TOOL: Runtime.run_action +# --------------------------------------------------------------------------- + + +_TOOL_KIND_TO_NAME: dict[str, str] = { + "run": "bash", + "run_ipython": "ipython", + "browse_interactive": "browser", + "browse": "browser", + "edit": "str_replace_editor", + "read": "file_read", + "write": "file_write", + "delegate": "delegate", + "finish": "finish", + "think": "think", + "task_tracking": "task_tracker", + "mcp": "mcp", + "send_message": "send_message", + # ``recall`` is a real (non-LLM-initiated) tool: the controller posts + # a RecallAction and the memory subsystem runs it just like any other + # action via ``Runtime.run_action``. Worth a TOOL span. + "recall": "recall", +} + +# Action types that are *not* real tool calls — they're internal control +# events posted by the controller / event-stream itself (system prompt, +# user message, agent-state transition, no-ops). Emitting TOOL spans for +# these clutters the trace tree and confuses the GenAI semconv (these +# aren't things the LLM "called"). +_INTERNAL_ACTION_TYPES: frozenset[str] = frozenset( + { + "message", + "system", + "change_agent_state", + "agent_state_changed", + "null", + "noop", + } +) + + +def _action_type_value(action: Any) -> str: + """Best-effort extract the canonical action-type string for ``action``. + + OpenHands declares ``ActionType`` as ``class ActionType(str, Enum)`` + with members like ``MESSAGE = 'message'``. Each Action subclass sets + ``action: str = ActionType.MESSAGE``. ``str(ActionType.MESSAGE)`` + returns ``'ActionType.MESSAGE'`` (Python's default Enum.__str__), + *not* the value ``'message'`` we want for filtering / lookup. This + helper prefers ``.value`` when the attribute is enum-like, else the + raw string. + """ + raw = safe_get_attr(action, "action") + if raw is None: + return "" + val = safe_get_attr(raw, "value") + if val is not None: + return safe_str(val).lower() + text = safe_str(raw).lower() + # ``str(ActionType.MESSAGE)`` → "actiontype.message"; strip the prefix. + prefix = "actiontype." + if text.startswith(prefix): + return text[len(prefix):] + return text + + +def _is_real_tool_call(action: Any) -> bool: + """Return True iff ``action`` represents a meaningful tool execution. + + Filtering rules (in order): + + 1. **Internal action types are *always* dropped** even when the + action carries ``tool_call_metadata``. OpenHands lets the LLM + produce ``MessageAction`` (via the ``send_message`` "tool"), + ``SystemMessageAction``, ``ChangeAgentStateAction`` etc. — those + are coordination signals, not real tool executions, and they + clutter the trace with sub-millisecond noise spans that the user + has explicitly asked us to suppress. + 2. Otherwise, an action qualifies if it has ``tool_call_metadata`` + (i.e. it was produced from an LLM ``tool_calls`` response — e.g. + ``execute_bash``, ``str_replace_editor``), or + 3. Its action-type is in the executable-tool whitelist + (``_TOOL_KIND_TO_NAME``) — this catches synthesized actions like + ``RECALL`` that don't come from the LLM but are still worth + tracing as TOOL spans (memory retrieval, microagent loading, + etc.). + """ + action_type = _action_type_value(action) + # Always drop internal/system actions regardless of how they were + # produced — see rule 1 above. + if action_type and action_type in _INTERNAL_ACTION_TYPES: + return False + if safe_get_attr(action, "tool_call_metadata") is not None: + return True + if not action_type: + return False + return action_type in _TOOL_KIND_TO_NAME + + +def _extract_tool_name(action: Any) -> tuple[str, str]: + """Return (tool_name, action_type). + + Prefers the function name carried on ``action.tool_call_metadata`` + (set when the action came from an LLM tool call) — that's what the + LLM and our LLM-side instrumentation know it as. Falls back to the + canonical action-type string (``ActionType.RECALL`` → ``"recall"``) + mapped through ``_TOOL_KIND_TO_NAME``. + """ + action_type = _action_type_value(action) + tcm = safe_get_attr(action, "tool_call_metadata") + if tcm is not None: + fn = safe_get_attr(tcm, "function_name") + if fn: + return safe_str(fn), action_type + tool_name = _TOOL_KIND_TO_NAME.get(action_type, action_type or "agent.action") + return tool_name, action_type + + +def _extract_tool_call_id(action: Any) -> str: + tcm = safe_get_attr(action, "tool_call_metadata") + if tcm is None: + return "" + return safe_str(safe_get_attr(tcm, "tool_call_id") or "") + + +def _runtime_sid(instance: Any) -> str: + """Best-effort discover the session id from a Runtime instance.""" + sid = safe_get_attr(instance, "sid") + if sid: + return safe_str(sid) + es = safe_get_attr(instance, "event_stream") + es_sid = safe_get_attr(es, "sid") + if es_sid: + return safe_str(es_sid) + return "" + + +class RuntimeRunActionWrapper: + """TOOL span around ``Runtime.run_action``. + + Bridges the session context across worker threads, then opens a TOOL + span whose ``input.value`` describes the action and whose + ``output.value`` describes the resulting observation. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return wrapped(*args, **kwargs) + + action = args[0] if args else kwargs.get("action") + # Skip internal control events — system prompts, user messages, + # memory recalls, agent-state transitions etc. aren't tool calls + # and shouldn't appear as TOOL spans alongside the real ones. + if not _is_real_tool_call(action): + return wrapped(*args, **kwargs) + + tool_name, action_type = _extract_tool_name(action) + tool_call_id = _extract_tool_call_id(action) + runtime_class = ( + f"{type(instance).__module__}.{type(instance).__name__}" + if instance + else "" + ) + sid = _runtime_sid(instance) + + # Look up the session-stashed context (STEP if a step is open, + # AGENT otherwise) and use it as the *explicit* parent context + # for the TOOL span. Explicit context= is more robust than + # relying on contextvars propagation across worker threads — it + # always parents under the latest STEP/AGENT no matter what + # thread/loop the runtime is running on. + parent_ctx = get_context(sid) + try: + span = self._tracer.start_span( + f"execute_tool {tool_name}", + kind=SpanKind.INTERNAL, + context=parent_ctx, + ) + except Exception: + with AttachedSession(sid): + span = self._tracer.start_span( + f"execute_tool {tool_name}", kind=SpanKind.INTERNAL + ) + # The TOOL span itself is parented *explicitly* via context= + # above. We additionally attach the session context throughout + # the wrapped call so any nested spans created by the runtime + # (e.g. a retried LLM call) that go through the contextvars + # propagation path also inherit the right session — and the + # ``otel_context.attach(set_span_in_context(span))`` below makes + # the TOOL itself current so retry-spawned child spans nest + # under TOOL, not under its parent STEP. + with AttachedSession(sid): + # ARMS GenAI semconv (Tool): + # gen_ai.span.kind=TOOL, gen_ai.operation.name=execute_tool, + # gen_ai.tool.name, gen_ai.tool.type + # gen_ai.tool.call.id, gen_ai.tool.description [recommended] + # gen_ai.tool.call.arguments, gen_ai.tool.call.result + # [optional, gated on capture-message-content] + _set_common(span, "TOOL") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value, + ) + span.set_attribute(GenAI.GEN_AI_TOOL_NAME, tool_name) + span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function") + if tool_call_id: + span.set_attribute(GEN_AI_TOOL_CALL_ID, tool_call_id) + if action_type: + # ``action_type`` from ``_extract_tool_name`` is the + # canonical lowercased value (e.g. ``"recall"``), suitable + # for ``openhands.action.type``. + span.set_attribute(OH_ACTION_TYPE, action_type) + if runtime_class: + span.set_attribute(OH_RUNTIME_NAME, runtime_class) + if sid: + span.set_attribute(GEN_AI_SESSION_ID, sid) + span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + + # gen_ai.tool.description — looked up via the per-sid registry + # populated by the AGENT wrapper from ``controller.agent.tools``. + try: + tool_def = get_tool_definition(sid, tool_name) + if tool_def is not None: + if isinstance(tool_def, dict): + fn = tool_def.get("function") or {} + desc = fn.get("description") if isinstance(fn, dict) else None + else: + fn = safe_get_attr(tool_def, "function") + desc = safe_get_attr(fn, "description") + if desc: + span.set_attribute(GEN_AI_TOOL_DESCRIPTION, safe_str(desc)) + except Exception: + pass + + # gen_ai.tool.call.arguments + input.value + arguments_dict = _tool_call_arguments(action) + try: + if arguments_dict: + args_json = to_json_str(arguments_dict) + if args_json: + span.set_attribute(GEN_AI_TOOL_CALL_ARGUMENTS, args_json) + # OpenInference compat — input.value mirrors the args. + _set_io(span, input_value=args_json) + # Convenience preview attribute on the action's primary + # input field (command / code / path / ...). + preview_field, preview_text = _first_preview_field(action) + if preview_text: + span.set_attribute( + f"openhands.action.{preview_field}", preview_text + ) + except Exception: + pass + + ctx = set_span_in_context(span) + token = otel_context.attach(ctx) + try: + try: + observation = wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status( + Status(StatusCode.ERROR, type(exc).__qualname__) + ) + raise + try: + _annotate_observation(span, observation) + except Exception: + pass + return observation + finally: + try: + otel_context.detach(token) + except Exception: + pass + span.end() + + +def _first_preview_field(action: Any) -> tuple[str, str]: + for attr in ("command", "code", "path", "url", "content"): + v = safe_get_attr(action, attr) + if v: + return attr, safe_str(v) + return "", "" + + +_TOOL_ARG_FIELDS: tuple[str, ...] = ( + "command", + "code", + "path", + "url", + "content", + "task_list", + "name", + "arguments", + "thought", + "is_input", + "blocking", + "keep_prompt", + "translated_ipython_code", + "browser_actions", + "agent_state", + "outputs", + "final_thought", + "old_str", + "new_str", + "view_range", + "file_text", + "insert_line", + "start_line", + "end_line", +) + + +def _tool_call_arguments(action: Any) -> dict[str, Any]: + """Return the bare arguments dict for ``gen_ai.tool.call.arguments``. + + Per ARMS GenAI semconv the value is a JSON string of *just* the call + arguments — e.g. ``{"location": "San Francisco", "date": "2025-10-01"}`` + — not the wrapping ``{"tool": ..., "arguments": ...}`` envelope. + """ + if action is None: + return {} + # When the action came from an LLM tool call, prefer the original + # JSON arguments the model emitted (most faithful to what the LLM + # actually requested). + tcm = safe_get_attr(action, "tool_call_metadata") + model_response = safe_get_attr(tcm, "model_response") if tcm else None + if model_response is not None: + try: + choices = ( + model_response.choices + if hasattr(model_response, "choices") + else None + ) or [] + for choice in choices: + msg = getattr(choice, "message", None) or ( + choice.get("message") if isinstance(choice, dict) else None + ) + tool_calls = ( + getattr(msg, "tool_calls", None) + if msg is not None + else None + ) or (msg.get("tool_calls") if isinstance(msg, dict) else None) + if not tool_calls: + continue + want_id = safe_str(safe_get_attr(tcm, "tool_call_id") or "") + for tc in tool_calls: + tc_id = ( + getattr(tc, "id", None) + if not isinstance(tc, dict) + else tc.get("id") + ) + if want_id and safe_str(tc_id) != want_id: + continue + fn = ( + getattr(tc, "function", None) + if not isinstance(tc, dict) + else tc.get("function") + ) + raw_args = ( + getattr(fn, "arguments", None) + if not isinstance(fn, dict) + else fn.get("arguments") + ) + if isinstance(raw_args, str): + try: + import json as _json + + return _json.loads(raw_args) + except Exception: + return {"raw": raw_args} + if isinstance(raw_args, dict): + return raw_args + except Exception: + pass + # Fallback: harvest known argument-bearing fields off the Action object. + args: dict[str, Any] = {} + for key in _TOOL_ARG_FIELDS: + v = safe_get_attr(action, key) + if v not in (None, "", [], {}): + args[key] = v + return args + + +def _observation_to_result(observation: Any) -> dict[str, Any]: + """Return a dict suitable for ``gen_ai.tool.call.result``.""" + if observation is None: + return {} + payload: dict[str, Any] = {} + for key in ( + "content", + "exit_code", + "error", + "interpreter_details", + "command", + "stdout", + "stderr", + "url", + "screenshot", + "outputs", + ): + v = safe_get_attr(observation, key) + if v not in (None, "", [], {}): + payload[key] = v + return payload + + +def _annotate_observation(span: trace_api.Span, observation: Any) -> None: + if observation is None: + return + obs_type = safe_str( + safe_get_attr(observation, "observation") or type(observation).__name__ + ) + if obs_type: + span.set_attribute(OH_OBSERVATION_TYPE, obs_type) + exit_code = safe_get_attr(observation, "exit_code") + if exit_code is not None: + try: + ec = int(exit_code) + span.set_attribute("openhands.action.exit_code", ec) + if ec != 0: + span.set_status(Status(StatusCode.ERROR, f"exit_code={ec}")) + except (TypeError, ValueError): + pass + error = safe_get_attr(observation, "error") + if error: + span.set_attribute("openhands.observation.error", safe_str(error)) + span.set_status(Status(StatusCode.ERROR, safe_str(error))) + # Emit gen_ai.tool.call.result + OpenInference output.value. + try: + result_payload = _observation_to_result(observation) + result_payload.setdefault("observation", obs_type) + out = to_json_str(result_payload) + if out: + span.set_attribute(GEN_AI_TOOL_CALL_RESULT, out) + span.set_attribute(OUTPUT_VALUE, out) + span.set_attribute(OUTPUT_MIME, "application/json") + except Exception: + pass + + +# --------------------------------------------------------------------------- +# ENTRY + AGENT (controller-lifecycle bound) +# +# Why this exists in addition to RunControllerWrapper / RunAgentUntilDoneWrapper: +# +# When OpenHands V0 is launched via ``python -m openhands.core.main``, Python +# executes ``main.py`` *as ``__main__``*. The ``from openhands.core.loop +# import run_agent_until_done`` (and other from-imports) at the top of +# ``main.py`` bind those symbols into ``__main__``'s namespace **before** +# our instrumentor patches ``openhands.core.main.run_controller`` / +# ``openhands.core.loop.run_agent_until_done``. The ``__main__`` block's +# ``asyncio.run(run_controller(...))`` call uses the *unpatched* local +# reference, so the wrappers above never fire — and the trace appears +# without an ENTRY span. +# +# STEP / TOOL spans work because ``_step`` and ``run_action`` are *class +# methods*: patching ``AgentController._step`` updates the class object +# that both ``__main__.AgentController`` and +# ``openhands.controller.agent_controller.AgentController`` reference, so +# every method lookup at call time finds the wrapped version. +# +# ENTRY+AGENT here exploit the same principle — they hook +# ``AgentController.__init__`` and ``AgentController.close``, both class +# methods, so the spans bracket the controller's lifecycle reliably no +# matter how ``run_controller`` was invoked. They no-op when a session +# context is already stashed for this sid (i.e. ``RunControllerWrapper`` +# fired successfully — the API/test-suite code path). +# --------------------------------------------------------------------------- + + +def _capture_agent_io_attributes( + span: trace_api.Span, controller: Any, agent: Any, state: Any +) -> None: + """Set gen_ai.system_instructions / input.messages / output.messages on + the AGENT span, following the ARMS GenAI semconv schema.""" + try: + sys_instr = _agent_to_system_instructions(agent, state) + if sys_instr: + payload = to_json_str(sys_instr) + if payload: + span.set_attribute(GEN_AI_SYSTEM_INSTRUCTIONS, payload) + # Some downstream ARMS views still look for the legacy singular key. + span.set_attribute(GEN_AI_SYSTEM_INSTRUCTION, payload) + except Exception: + pass + try: + history = safe_get_attr(state, "history") or [] + if isinstance(history, list) and history: + input_msgs = _history_to_input_messages_schema(history) + if input_msgs: + payload = to_json_str(input_msgs) + if payload: + span.set_attribute(GEN_AI_INPUT_MESSAGES, payload) + _set_io(span, input_value=payload) + output_msgs = _history_to_output_messages_schema(history) + if output_msgs: + payload = to_json_str(output_msgs) + if payload: + span.set_attribute(GEN_AI_OUTPUT_MESSAGES, payload) + _set_io(span, output_value=payload) + except Exception: + pass + + +def _open_entry_and_agent_for_controller( + tracer: Tracer, controller: Any +) -> None: + """Open ENTRY (parent) + AGENT (child) + warmup STEP for ``controller``. + + Opening a *warmup STEP* (round 1) right after AGENT means that any + pre-step actions like RECALL — which are dispatched to the runtime + *before* the first ``_step`` invocation — become children of STEP 1 + instead of dangling siblings under AGENT. The first real ``_step`` + call detects that the warmup STEP isn't yet "consumed" and reuses + it (without bumping the round counter) so the LLM call + first + LLM-driven tool also nest under STEP 1. + + All inner span creations use the explicit ``context=`` argument + (instead of relying on ``contextvars`` propagation through + ``otel_context.attach``) — this is the most deterministic way to + parent a child span and avoids the entire class of "Token was + created in a different Context" failures we used to chase across + asyncio-task / thread boundaries. + + Idempotent on ``_OWNS_FLAG`` — safe to call multiple times for the + same controller. Deliberately does **not** check whether a session + context is already stashed: under ``python -m openhands.core.main`` + the from-import binding bypasses ``RunControllerWrapper`` and + ``RunAgentUntilDoneWrapper``, so the init wrapper is the only + reliable source of ENTRY+AGENT and must always run. + """ + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return + if getattr(controller, _OWNS_FLAG, False): + # Already opened (e.g. RunControllerWrapper fired first) — log + # and bail. We don't want to double-emit ENTRY/AGENT. + logger.debug( + "OpenHands instrumentation: ENTRY+AGENT already open on " + "controller %s — skipping init-wrapper open", + id(controller), + ) + return + + sid = safe_str(safe_get_attr(controller, "id") or "") + agent = safe_get_attr(controller, "agent") + agent_name = safe_get_attr(agent, "name") or "codeact" + agent_class = ( + f"{type(agent).__module__}.{type(agent).__name__}" if agent else "" + ) + llm = safe_get_attr(agent, "llm") + llm_config = safe_get_attr(llm, "config") + model = safe_get_attr(llm_config, "model") or safe_get_attr(llm, "model") + + # ----- ENTRY ----- + # If RunControllerWrapper already stashed an ENTRY context, parent AGENT + # directly under it. Otherwise create the lifecycle-owned ENTRY here. + entry: trace_api.Span | None = None + entry_ctx = get_context(sid) + if entry_ctx is None: + try: + entry = tracer.start_span("enter openhands", kind=SpanKind.INTERNAL) + except Exception as exc: + logger.error( + "OpenHands instrumentation: failed to start ENTRY span for " + "sid=%r: %s", + sid, + exc, + exc_info=True, + ) + return + + try: + _set_common(entry, "ENTRY") + entry.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter") + if sid: + entry.set_attribute(GEN_AI_SESSION_ID, sid) + entry.set_attribute(GEN_AI_CONVERSATION_ID, sid) + if agent_class: + entry.set_attribute(OH_AGENT_NAME, agent_class) + if model: + entry.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + state = safe_get_attr(controller, "state") + entry_input_messages, _ = _entry_io_from_state(state) + if entry_input_messages: + _set_io( + entry, + input_value=entry_input_messages, + input_messages=entry_input_messages, + ) + except Exception as exc: + logger.debug("OpenHands instrumentation: ENTRY attr setup: %s", exc) + + entry_ctx = set_span_in_context(entry) + + # ----- AGENT (child of ENTRY) ----- + # Pass ``context=entry_ctx`` *explicitly* so AGENT inherits ENTRY + # as parent regardless of what the surrounding contextvars look + # like (some 3rd-party SDKs reset contextvars between calls). + try: + agent_span = tracer.start_span( + f"invoke_agent {agent_name}", + kind=SpanKind.INTERNAL, + context=entry_ctx, + ) + except Exception as exc: + logger.error( + "OpenHands instrumentation: failed to start AGENT span for " + "sid=%r: %s", + sid, + exc, + exc_info=True, + ) + if entry is not None: + try: + entry.end() + except Exception: + pass + return + + try: + _set_common(agent_span, "AGENT") + agent_span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + agent_span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)) + if agent_class: + agent_span.set_attribute(OH_AGENT_NAME, agent_class) + if sid: + agent_span.set_attribute(GEN_AI_SESSION_ID, sid) + agent_span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + agent_span.set_attribute(GEN_AI_AGENT_ID, sid) + if model: + agent_span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + except Exception as exc: + logger.debug("OpenHands instrumentation: AGENT attr setup: %s", exc) + + # Tool registry + gen_ai.tool.definitions — same logic as + # RunAgentUntilDoneWrapper, since this path also needs the + # registry for downstream TOOL spans. + try: + tools = safe_get_attr(agent, "tools") or [] + if sid: + store_tool_registry(sid, tools) + defs_summary: list[dict[str, Any]] = [] + for t in tools: + if isinstance(t, dict): + kind = t.get("type") or "function" + fn = t.get("function") or {} + name = fn.get("name") if isinstance(fn, dict) else None + else: + kind = safe_get_attr(t, "type") or "function" + fn = safe_get_attr(t, "function") + name = safe_get_attr(fn, "name") + if not name: + continue + item: dict[str, Any] = {"type": safe_str(kind), "name": safe_str(name)} + if isinstance(fn, dict): + desc = fn.get("description") + params = fn.get("parameters") + else: + desc = safe_get_attr(fn, "description") + params = safe_get_attr(fn, "parameters") + if desc: + item["description"] = safe_str(desc) + if params: + item["parameters"] = params + defs_summary.append(item) + if defs_summary: + agent_span.set_attribute( + GEN_AI_TOOL_DEFINITIONS, to_json_str(defs_summary) + ) + except Exception: + pass + + # Best-effort INPUT + system_instructions capture on AGENT at open + # time. ``_capture_agent_io_attributes`` will run again at close to + # overwrite these with the *final* state, but having them now means + # an in-flight read of the AGENT span (e.g. live dashboards) sees + # at least the system prompt + initial user message. + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes(agent_span, controller, agent, state) + except Exception as exc: + logger.debug( + "OpenHands instrumentation: AGENT initial I/O capture: %s", exc + ) + + agent_ctx = set_span_in_context(agent_span) + if sid: + # Stash ctx-with-AGENT so STEP / TOOL re-attach correctly even + # when fired from worker threads with brand-new asyncio loops. + # The downstream consumers (STEP / TOOL / LLM bridge) all do + # their own paired attach/detach, so it's safe to share this + # ``Context`` object across asyncio tasks and threads. + store_context(sid, agent_ctx) + + # ----- WARMUP STEP (round 1) ----- + # Open right after AGENT so any pre-_step actions (RECALL, etc.) that + # the controller dispatches to the runtime become children of STEP 1 + # rather than dangling siblings under AGENT. The first real ``_step`` + # call detects this open STEP isn't yet "consumed" and reuses it + # (preserving the round number) so the LLM call + first LLM-driven + # tool also nest under STEP 1 — giving the trace tree: + # + # ENTRY > AGENT > STEP 1 > [RECALL, LLM, execute_bash] + # STEP 2 > [LLM, finish] + # ... + warmup_step_ctx: object | None = None + warmup_step_span: trace_api.Span | None = None + try: + warmup_step_span = tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + context=agent_ctx, + ) + _set_common(warmup_step_span, "STEP") + warmup_step_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react") + warmup_step_span.set_attribute(OH_REACT_ROUND, 1) + warmup_step_span.set_attribute( + GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name) + ) + if sid: + warmup_step_span.set_attribute(GEN_AI_SESSION_ID, sid) + warmup_step_span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + warmup_step_span.set_attribute(GEN_AI_AGENT_ID, sid) + warmup_step_ctx = set_span_in_context(warmup_step_span) + if sid and warmup_step_ctx is not None: + store_context(sid, warmup_step_ctx) + except Exception as exc: + logger.debug("Failed to open warmup STEP span: %s", exc) + warmup_step_span = None + + # Stash everything we need to tear down in close(). + try: + setattr(controller, _OWNS_FLAG, True) + setattr(controller, _ENTRY_SPAN_ATTR, entry) + setattr(controller, _AGENT_SPAN_ATTR, agent_span) + # Save the AGENT context so the STEP wrapper can restore the + # session stash to AGENT every time it closes a STEP — that way + # any TOOL fired between rounds re-attaches AGENT (not a closed + # STEP). + setattr(controller, _AGENT_CTX_ATTR, agent_ctx) + # Stash warmup STEP so the first real ``_step`` reuses it. + setattr(controller, _STEP_SPAN_ATTR, warmup_step_span) + setattr(controller, "_otel_oh_round", 1 if warmup_step_span is not None else 0) + setattr(controller, "_otel_oh_step_consumed", False) + except Exception: + # If we can't attach to the instance (slots, etc.), close the + # spans down so we don't leak them. + if warmup_step_span is not None: + try: + warmup_step_span.end() + except Exception: + pass + try: + agent_span.end() + except Exception: + pass + if entry is not None: + try: + entry.end() + except Exception: + pass + return + + # Log at INFO so the user can verify in their app logs that the + # ENTRY+AGENT spans were actually opened (and which trace/span IDs + # they got). When a user reports "no ENTRY span" in their backend, + # the first thing to check is whether this log line appeared. + try: + entry_sc = entry.get_span_context() if entry is not None else None + agent_sc = agent_span.get_span_context() + warmup_sc = ( + warmup_step_span.get_span_context() + if warmup_step_span is not None + else None + ) + logger.info( + "OpenHands instrumentation: opened ENTRY+AGENT for sid=%r " + "(trace_id=%032x entry_span=%016x agent_span=%016x " + "warmup_step=%s agent_name=%s model=%s)", + sid, + entry_sc.trace_id if entry_sc is not None else agent_sc.trace_id, + entry_sc.span_id if entry_sc is not None else 0, + agent_sc.span_id, + f"{warmup_sc.span_id:016x}" if warmup_sc is not None else "none", + agent_name, + model or "", + ) + except Exception: + pass + + +def _close_entry_and_agent_for_controller( + controller: Any, *, error: BaseException | None = None +) -> None: + """Tear down the ENTRY+AGENT spans previously opened for ``controller``. + + Also closes any STEP span left open from the last ``_step`` invocation + (STEP spans are intentionally persisted across the return of ``_step`` + so that thread-pooled TOOL / LLM calls fire as their children). + """ + if not getattr(controller, _OWNS_FLAG, False): + logger.debug( + "OpenHands instrumentation: close called on controller %s " + "without an open ENTRY/AGENT — nothing to do", + id(controller), + ) + return + sid = safe_str(safe_get_attr(controller, "id") or "") + agent = safe_get_attr(controller, "agent") + state = safe_get_attr(controller, "state") + entry_span: trace_api.Span | None = getattr(controller, _ENTRY_SPAN_ATTR, None) + agent_span: trace_api.Span | None = getattr(controller, _AGENT_SPAN_ATTR, None) + # Legacy slots — kept for back-compat with already-instrumented + # instances created before we stopped persisting attach-tokens. + # If they're set we simply ignore them (any detach attempt across + # asyncio task boundaries would raise ``ValueError`` in the Aliyun + # SDK; spans alone carry all the parentage info we need). + _ = getattr(controller, _AGENT_TOKEN_ATTR, None) + _ = getattr(controller, _ENTRY_TOKEN_ATTR, None) + + # Close any STEP span still hanging from the last round before tearing + # down AGENT/ENTRY. Restores the session stash to AGENT context so any + # in-flight TOOL re-attaches AGENT (not a closed STEP). + try: + _close_open_step(controller) + except Exception: + pass + + # Capture I/O attributes on the AGENT span before ending it. + if agent_span is not None: + try: + _capture_agent_io_attributes(agent_span, controller, agent, state) + except Exception: + pass + try: + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + agent_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + agent_span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") or safe_str(agent_state), + ) + except Exception: + pass + if error is not None: + try: + agent_span.record_exception(error) + agent_span.set_status( + Status(StatusCode.ERROR, type(error).__qualname__) + ) + except Exception: + pass + + # End AGENT (no detach — the token (if any) was attached in the + # ``__init__`` task's contextvars context and detaching here would + # cross a context boundary, raising ``ValueError`` in the Aliyun + # SDK. Legacy code may have set ``agent_token`` on older instances; + # we simply leave it alone — detaching is unnecessary because the + # span carries its own parentage and contextvars naturally unwind + # when the task that attached them exits). + if agent_span is not None: + try: + agent_span.end() + except Exception: + pass + + # Mirror the most-useful bits onto ENTRY before closing it. + if entry_span is not None: + try: + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + entry_span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") or safe_str(agent_state), + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + entry_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + output_repr = _final_state_to_output(state) + entry_input_messages, entry_output_messages = _entry_io_from_state( + state + ) + if output_repr: + _set_io( + entry_span, + output_value=output_repr, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + elif entry_input_messages or entry_output_messages: + _set_io( + entry_span, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + except Exception: + pass + if error is not None: + try: + entry_span.record_exception(error) + entry_span.set_status( + Status(StatusCode.ERROR, type(error).__qualname__) + ) + except Exception: + pass + + # Same as AGENT: end the span; never touch a possibly-leftover token + # from an older instrumentation run. + if entry_span is not None: + try: + entry_span.end() + except Exception: + pass + + # Mirror the open-time INFO log so the user can confirm the spans + # actually closed and exported. + try: + agent_sc = ( + agent_span.get_span_context() if agent_span is not None else None + ) + entry_sc = ( + entry_span.get_span_context() if entry_span is not None else None + ) + logger.info( + "OpenHands instrumentation: closed ENTRY+AGENT for sid=%r " + "(entry_span=%s agent_span=%s rounds=%s error=%s)", + sid, + f"{entry_sc.span_id:016x}" if entry_sc is not None else "none", + f"{agent_sc.span_id:016x}" if agent_sc is not None else "none", + getattr(controller, "_otel_oh_round", 0), + type(error).__qualname__ if error is not None else "none", + ) + except Exception: + pass + + if sid: + try: + clear_context(sid) + except Exception: + pass + + # Wipe stash slots so a re-used controller instance doesn't double-emit. + for attr in ( + _OWNS_FLAG, + _ENTRY_SPAN_ATTR, + _AGENT_SPAN_ATTR, + _ENTRY_TOKEN_ATTR, + _AGENT_TOKEN_ATTR, + _STEP_SPAN_ATTR, + _AGENT_CTX_ATTR, + "_otel_oh_step_consumed", + "_otel_oh_round", + ): + try: + setattr(controller, attr, None) + except Exception: + pass + try: + setattr(controller, _OWNS_FLAG, False) + except Exception: + pass + + +class AgentControllerInitWrapper: + """Open ENTRY + AGENT spans at the end of ``AgentController.__init__``. + + Always reliable under ``python -m openhands.core.main`` because it + hooks a class method (immune to from-import binding). + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + try: + result = wrapped(*args, **kwargs) + except BaseException: + raise + try: + # Skip delegate sub-controllers — they shouldn't open another + # ENTRY span; they live within the parent controller's trace. + is_delegate = bool(safe_get_attr(instance, "is_delegate")) + if is_delegate: + logger.debug( + "OpenHands instrumentation: skipping delegate " + "controller %s for ENTRY/AGENT", + id(instance), + ) + else: + _open_entry_and_agent_for_controller(self._tracer, instance) + except Exception as exc: + # Promote to ERROR — if this fails the user will see "no + # ENTRY span" in their backend and we want a loud signal in + # the app logs to point at the cause. + logger.error( + "OpenHands instrumentation: AgentController init wrapper " + "failed to open ENTRY/AGENT for controller %s: %s", + id(instance), + exc, + exc_info=True, + ) + return result + + +class AgentControllerCloseWrapper: + """End the ENTRY + AGENT spans previously opened in ``__init__``.""" + + __slots__ = () + + def __init__(self, _tracer: Tracer): + # Tracer arg unused (we only need the spans we previously opened) + # but kept for symmetry with the other factories. + pass + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + async def _impl(self, wrapped, instance, args, kwargs): + err: BaseException | None = None + try: + return await wrapped(*args, **kwargs) + except BaseException as exc: + err = exc + raise + finally: + try: + _close_entry_and_agent_for_controller(instance, error=err) + except Exception as exc: + logger.error( + "OpenHands instrumentation: AgentController close " + "wrapper failed to end spans for controller %s: %s", + id(instance), + exc, + exc_info=True, + ) + + +# --------------------------------------------------------------------------- +# LLM context bridge: openhands.llm.llm.LLM.__init__ +# --------------------------------------------------------------------------- + + +# Sentinel used to mark already-bridged completion callables so we don't +# wrap them more than once if ``LLM.__init__`` runs again on the same +# completion partial (e.g. live config reload). +_LLM_BRIDGE_FLAG = "_otel_oh_ctx_bridged" + + +class LLMInitWrapper: + """Make sure ``LLM.completion`` runs with the current STEP context attached. + + Why this exists + --------------- + The LLM call inside ``AgentController._step`` is synchronous and *should* + inherit our STEP context via ``contextvars`` — but in real OpenHands + deployments LiteLLM ends up creating its span with a *different* + ``trace_id`` than the surrounding STEP/AGENT/ENTRY tree. Two known ways + that can happen: + + * a 3rd-party auto-instrumentation injected before ours stashes the + LLM call onto a thread-pool worker (no contextvars propagation); + * the call is made from outside any of our wrappers (e.g. a condenser + / summarizer worker) where no OTel context is current. + + The fix: at the end of ``LLM.__init__`` we monkey-patch ``self._completion`` + with a tiny shim that re-attaches the latest sid-stashed context (which, + while a STEP is open, is the STEP context — see ``AgentControllerStepWrapper``). + The downstream ``opentelemetry-instrumentation-litellm`` (or the Aliyun + GenAI auto-instrumentation) will then create the LLM span as a child + of STEP and the ``trace_id`` finally lines up. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + # Tracer arg unused — we only re-attach an existing OTel context + # so the *real* LLM instrumentor (litellm / aliyun) emits the + # span under it. We don't create our own LLM span here. + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + result = wrapped(*args, **kwargs) + try: + self._patch_completion(instance) + except Exception as exc: + logger.debug("LLM init wrapper failed to bridge completion: %s", exc) + return result + + @staticmethod + def _patch_completion(instance: Any) -> None: + completion = getattr(instance, "_completion", None) + if completion is None: + return + if getattr(completion, _LLM_BRIDGE_FLAG, False): + return + + def bridged(*a: Any, **kw: Any) -> Any: + # ``AttachedSession(None)`` re-attaches whatever context the + # most recent v0 wrapper stashed (STEP if a step is open, + # AGENT otherwise). When no OpenHands session is active the + # context manager is a no-op. + with AttachedSession(None): + return completion(*a, **kw) + + try: + setattr(bridged, _LLM_BRIDGE_FLAG, True) + except Exception: + pass + try: + instance._completion = bridged + except Exception: + return + # Mirror onto the unwrapped slot too — some OpenHands codepaths + # call ``_completion_unwrapped`` directly when retries are + # disabled, and we want them to inherit the same parent context. + unwrapped = getattr(instance, "_completion_unwrapped", None) + if unwrapped is not None and not getattr(unwrapped, _LLM_BRIDGE_FLAG, False): + + def bridged_unwrapped(*a: Any, **kw: Any) -> Any: + with AttachedSession(None): + return unwrapped(*a, **kw) + + try: + setattr(bridged_unwrapped, _LLM_BRIDGE_FLAG, True) + except Exception: + pass + try: + instance._completion_unwrapped = bridged_unwrapped + except Exception: + pass diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/package.py new file mode 100644 index 000000000..6e3b6b925 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/package.py @@ -0,0 +1 @@ +_instruments = ("openhands-ai >= 1.0.0",) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/test-requirements.txt new file mode 100644 index 000000000..b5c521bd2 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/test-requirements.txt @@ -0,0 +1,9 @@ +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +wrapt>=1.0.0 +httpx>=0.24.0 + +-e ./instrumentation-loongsuite/loongsuite-instrumentation-openhands +-e ./opentelemetry-instrumentation +-e ./opentelemetry-sdk +-e ./opentelemetry-semantic-conventions diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py new file mode 100644 index 000000000..685e33b35 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py @@ -0,0 +1,244 @@ +"""Shared pytest fixtures and stub modules for the OpenHands instrumentation. + +We deliberately don't require ``openhands-ai`` to be installed at test time: +instead we register lightweight stub modules under the same dotted paths so +``wrap_function_wrapper`` can patch them. The wrappers themselves only rely on +the *call signatures* documented in ``execute.md`` — which we faithfully +reproduce in the stubs. +""" + +from __future__ import annotations + +import sys +import types +from dataclasses import dataclass, field + +import pytest +from opentelemetry import trace as trace_api +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + + +def _ensure_stub_module(name: str) -> types.ModuleType: + if name in sys.modules: + return sys.modules[name] + mod = types.ModuleType(name) + sys.modules[name] = mod + parent_name, _, leaf = name.rpartition(".") + if parent_name: + parent = _ensure_stub_module(parent_name) + setattr(parent, leaf, mod) + return mod + + +def _install_v0_stub_modules() -> None: + """Stubs for the V0 (Legacy CodeAct) hook points.""" + _ensure_stub_module("openhands") + core = _ensure_stub_module("openhands.core") + main_mod = _ensure_stub_module("openhands.core.main") + loop_mod = _ensure_stub_module("openhands.core.loop") + ctrl_pkg = _ensure_stub_module("openhands.controller") + ctrl_mod = _ensure_stub_module("openhands.controller.agent_controller") + rt_pkg = _ensure_stub_module("openhands.runtime") + rt_base = _ensure_stub_module("openhands.runtime.base") + + @dataclass + class _AgentState: + value: str = "finished" + + @dataclass + class _State: + agent_state: _AgentState = field(default_factory=_AgentState) + + @dataclass + class _LLMConfig: + model: str = "qwen3-coder-plus" + + @dataclass + class _LLM: + config: _LLMConfig = field(default_factory=_LLMConfig) + + @dataclass + class _Agent: + name: str = "CodeActAgent" + llm: _LLM = field(default_factory=_LLM) + # Mirrors litellm ChatCompletionToolParam dicts as produced by + # openhands.agenthub.codeact_agent.codeact_agent.CodeActAgent._get_tools. + tools: list = field( + default_factory=lambda: [ + { + "type": "function", + "function": { + "name": "execute_bash", + "description": "Run a bash command on the runtime sandbox.", + "parameters": { + "type": "object", + "properties": { + "command": {"type": "string"}, + }, + "required": ["command"], + }, + }, + }, + ] + ) + + class AgentController: + step_calls = 0 + close_calls = 0 + + def __init__(self, agent=None, sid="sid-test"): + self.agent = agent or _Agent() + self.id = sid + self.state = _State() + self._pending_action = None + self.is_delegate = False + + async def _step(self) -> None: + type(self).step_calls += 1 + class _Pending: + action = "run" + command = "echo step" + thought = "trying" + + self._pending_action = _Pending() + + async def close(self, set_stop_state: bool = True) -> None: + type(self).close_calls += 1 + + ctrl_mod.AgentController = AgentController + + class _ToolCallMetadata: + """Stand-in for :class:`openhands.events.tool.ToolCallMetadata`.""" + + def __init__(self, function_name="", tool_call_id="", arguments=None): + import json as _json + + self.function_name = function_name + self.tool_call_id = tool_call_id + + class _Fn: + def __init__(self, name, args): + self.name = name + self.arguments = _json.dumps(args or {}) + + class _TC: + def __init__(self, tcid, fn): + self.id = tcid + self.function = fn + + class _Msg: + def __init__(self, tcs): + self.tool_calls = tcs + + class _Choice: + def __init__(self, msg): + self.message = msg + + class _ModelResp: + def __init__(self, choices): + self.choices = choices + + self.model_response = _ModelResp( + [_Choice(_Msg([_TC(tool_call_id, _Fn(function_name, arguments))]))] + ) + + class _Action: + def __init__( + self, + action_type="run", + command="echo hi", + tool_call_metadata=None, + ): + self.action = action_type + self.command = command + self.tool_call_metadata = tool_call_metadata + + class _Observation: + def __init__(self, exit_code=0, content=""): + self.exit_code = exit_code + self.content = content + self.observation = "run" + + class Runtime: + run_action_calls = 0 + # Tests can override on the instance to drive observation values. + _next_observation: _Observation | None = None + + def __init__(self, sid="sid-test"): + self.sid = sid + + def run_action(self, action) -> _Observation: + type(self).run_action_calls += 1 + obs = self._next_observation + if obs is not None: + self._next_observation = None + return obs + return _Observation(exit_code=0) + + rt_base.Runtime = Runtime + rt_base.Action = _Action + rt_base.Observation = _Observation + rt_base.ToolCallMetadata = _ToolCallMetadata + + @dataclass + class _State2: + agent_state: _AgentState = field(default_factory=lambda: _AgentState("finished")) + + async def run_controller( + config=None, + initial_user_action=None, + sid: str | None = None, + **kwargs, + ): + # Mirror real V0: invoke the agent loop *inside* run_controller so + # the AGENT span lives within the ENTRY span (and inherits its + # stashed OTel context). Tests can install + # ``main_mod._test_inner_args = (controller, runtime)`` to opt in. + inner_args = getattr(main_mod, "_test_inner_args", None) + if inner_args is not None: + controller, runtime = inner_args + await loop_mod.run_agent_until_done(controller, runtime, None, []) + return _State2() + + main_mod.run_controller = run_controller + + async def run_agent_until_done(controller, runtime, memory, end_states): + # Tests can install a custom inner callback to drive STEP / TOOL + # spans inside the AGENT span; default is a no-op. + cb = getattr(loop_mod, "_test_inner_callback", None) + if callable(cb): + await cb(controller, runtime) + return None + + loop_mod.run_agent_until_done = run_agent_until_done + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tracer_provider() -> TracerProvider: + provider = TracerProvider() + exporter = InMemorySpanExporter() + provider.add_span_processor(SimpleSpanProcessor(exporter)) + provider._exporter = exporter # type: ignore[attr-defined] + return provider + + +@pytest.fixture +def stub_openhands_v0_modules() -> None: + _install_v0_stub_modules() + + +@pytest.fixture(autouse=True) +def _reset_global_tracer(): + """Avoid bleed-through of the SDK provider between tests.""" + yield + trace_api._TRACER_PROVIDER = None # type: ignore[attr-defined] + diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_tool_attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_tool_attributes.py new file mode 100644 index 000000000..d94ce16bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_tool_attributes.py @@ -0,0 +1,201 @@ +"""ARMS GenAI semconv §Tool conformance tests for the V0 TOOL wrapper. + +I/O capture is always on (no env-var gating, no truncation), so the +TOOL span must carry every attribute the spec calls out — both +required and recommended — on every run. +""" + +from __future__ import annotations + +import asyncio +import json + +import pytest + + +def _spans_by_kind(exporter, kind: str): + return [ + s + for s in exporter.get_finished_spans() + if s.attributes.get("gen_ai.span.kind") == kind + ] + + +@pytest.fixture +def instrumented(tracer_provider, stub_openhands_v0_modules): + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + from opentelemetry.instrumentation.openhands.internal import session_context + + session_context.clear_all() + inst = OpenHandsInstrumentor() + inst.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + try: + yield inst, tracer_provider._exporter # type: ignore[attr-defined] + finally: + try: + inst.uninstrument() + except Exception: + pass + session_context.clear_all() + + +def _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod): + """Drive a single ENTRY → AGENT → STEP → TOOL flow.""" + ctrl = ctrl_mod.AgentController(sid="tool-sid") + runtime = rt_base.Runtime(sid="tool-sid") + + tcm = rt_base.ToolCallMetadata( + function_name="execute_bash", + tool_call_id="call_abc123", + arguments={"command": "ls /tmp", "thought": "list temp"}, + ) + action = rt_base.Action( + action_type="run", + command="ls /tmp", + tool_call_metadata=tcm, + ) + + class MessageAction: + content = "list /tmp" + source = "user" + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="tool-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + +def test_tool_span_carries_all_arms_required_attributes(instrumented): + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod) + + tools = _spans_by_kind(exporter, "TOOL") + assert len(tools) == 1 + tool = tools[0] + attrs = tool.attributes + + # Required + assert attrs["gen_ai.span.kind"] == "TOOL" + assert attrs["gen_ai.operation.name"] == "execute_tool" + + # Span name should be `execute_tool {tool_name}` + assert tool.name == "execute_tool execute_bash" + + # Recommended attributes + assert attrs["gen_ai.tool.name"] == "execute_bash" + assert attrs["gen_ai.tool.type"] == "function" + assert attrs["gen_ai.tool.call.id"] == "call_abc123" + assert attrs.get("gen_ai.tool.description") == ( + "Run a bash command on the runtime sandbox." + ) + + # Arguments should be the BARE JSON dict, not the wrapping + # {"tool": ..., "arguments": ...} envelope. + args_json = attrs.get("gen_ai.tool.call.arguments") + assert args_json is not None + args = json.loads(args_json) + assert args == {"command": "ls /tmp", "thought": "list temp"} + + # Result should reflect the observation. + result_json = attrs.get("gen_ai.tool.call.result") + assert result_json is not None + result = json.loads(result_json) + assert result.get("exit_code") == 0 + assert "observation" in result + + +def test_tool_span_falls_back_to_action_field_when_no_tool_call_metadata( + instrumented, +): + """If the action wasn't generated from an LLM tool call (e.g. a + user-initiated agent.action), the wrapper should still produce a + sensible ``gen_ai.tool.name`` derived from the action type.""" + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="tool-fallback-sid") + runtime = rt_base.Runtime(sid="tool-fallback-sid") + action = rt_base.Action(action_type="run", command="echo hi") + + class MessageAction: + content = "say hi" + source = "user" + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="tool-fallback-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + tool = _spans_by_kind(exporter, "TOOL")[0] + attrs = tool.attributes + + # Action.action == "run" → tool name "bash" + assert attrs["gen_ai.tool.name"] == "bash" + assert tool.name == "execute_tool bash" + # No tool-call id when the action wasn't from an LLM call + assert attrs.get("gen_ai.tool.call.id", "") == "" + # Arguments still produced from the action's fields + args = json.loads(attrs["gen_ai.tool.call.arguments"]) + assert args.get("command") == "echo hi" + + +def test_agent_span_emits_tool_definitions(instrumented): + """AGENT span should advertise the agent's available tools per the + ARMS GenAI semconv §Agent → ``gen_ai.tool.definitions``.""" + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod) + + agent = _spans_by_kind(exporter, "AGENT")[0] + defs_json = agent.attributes.get("gen_ai.tool.definitions") + assert defs_json, "AGENT span should set gen_ai.tool.definitions" + defs = json.loads(defs_json) + assert isinstance(defs, list) and defs + assert defs[0]["type"] == "function" + assert defs[0]["name"] == "execute_bash" + assert "description" in defs[0] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py new file mode 100644 index 000000000..9025f9991 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py @@ -0,0 +1,246 @@ +"""Cross-thread / cross-loop trace continuity tests for V0 wrappers. + +These tests model the *real* OpenHands V0 runtime behaviour: events are +delivered by ``EventStream`` via a ``ThreadPoolExecutor`` and the controller +processes them with ``asyncio.get_event_loop().run_until_complete(...)`` — +which spins a brand-new asyncio loop in the worker thread. Without our +session-context bridge, STEP / TOOL spans would start fresh root traces. + +We assert: + +* All ENTRY / AGENT / STEP / TOOL spans share the **same** ``trace_id``. +* Parent-child wiring is correct (STEP is parented under AGENT, TOOL too). +* The session-context store is cleaned up after the entry returns. +* GenAI semantic-convention I/O attributes are populated when content + capture is enabled. +""" + +from __future__ import annotations + +import asyncio +import os +import threading +from concurrent.futures import ThreadPoolExecutor + +import pytest + + +def _spans_by_kind_attr(exporter, kind: str): + return [ + s + for s in exporter.get_finished_spans() + if s.attributes.get("gen_ai.span.kind") == kind + ] + + +@pytest.fixture +def instrumented_v0(tracer_provider, stub_openhands_v0_modules): + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + from opentelemetry.instrumentation.openhands.internal import session_context + + session_context.clear_all() + inst = OpenHandsInstrumentor() + inst.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + try: + yield inst, tracer_provider._exporter # type: ignore[attr-defined] + finally: + try: + inst.uninstrument() + except Exception: + pass + session_context.clear_all() + + +def _drive_step_in_worker_thread(controller, runtime, action) -> None: + """Reproduce the V0 EventStream → ThreadPoolExecutor → run_until_complete path. + + The worker thread (a) has no shared asyncio loop with the caller and + (b) has a *fresh* ``contextvars.Context`` (Python copies the snapshot + at submit-time, but the snapshot is from this test thread — the same + fresh context the real EventStream queue thread would have). + """ + barrier = threading.Event() + err: list[BaseException] = [] + + def _worker(): + try: + # New event loop per worker — exactly what V0 does. + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(controller._step()) + # Run_action is sync — call it directly inside the worker. + runtime.run_action(action) + finally: + loop.close() + except BaseException as exc: # pragma: no cover - surfaced via err + err.append(exc) + finally: + barrier.set() + + pool = ThreadPoolExecutor(max_workers=1) + fut = pool.submit(_worker) + fut.result(timeout=5) + pool.shutdown(wait=True) + barrier.wait(timeout=5) + if err: + raise err[0] + + +def test_all_spans_share_one_trace_id_across_threads(instrumented_v0): + """The whole V0 trace must collapse onto a single trace_id even when + STEP / TOOL run in fresh worker threads with fresh asyncio loops.""" + inst, exporter = instrumented_v0 + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="bench-001") + runtime = rt_base.Runtime(sid="bench-001") + action = rt_base.Action(action_type="run", command="ls /") + + async def _inner(_controller, _runtime): + for _ in range(2): + _drive_step_in_worker_thread(ctrl, runtime, action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + class MessageAction: + content = "say hi" + source = "user" + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="bench-001", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + spans = exporter.get_finished_spans() + by_kind = {kind: _spans_by_kind_attr(exporter, kind) for kind in ("ENTRY", "AGENT", "STEP", "TOOL")} + + assert len(by_kind["ENTRY"]) == 1 + assert len(by_kind["AGENT"]) == 1 + assert len(by_kind["STEP"]) == 2 + assert len(by_kind["TOOL"]) == 2 + + entry = by_kind["ENTRY"][0] + agent = by_kind["AGENT"][0] + trace_id = entry.context.trace_id + + # Same trace_id for every span + for s in spans: + assert s.context.trace_id == trace_id, ( + f"span {s.name!r} (kind={s.attributes.get('gen_ai.span.kind')}) " + f"has trace_id {s.context.trace_id} but expected {trace_id}" + ) + + # Parent-child links: AGENT under ENTRY, STEP under AGENT, TOOL under AGENT + assert agent.parent is not None and agent.parent.span_id == entry.context.span_id + for s in by_kind["STEP"]: + assert s.parent is not None and s.parent.span_id == agent.context.span_id + for t in by_kind["TOOL"]: + assert t.parent is not None and t.parent.span_id == agent.context.span_id + + +def test_session_context_cleared_after_entry(instrumented_v0): + """The per-sid stash must not leak across runs.""" + inst, exporter = instrumented_v0 + + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + from opentelemetry.instrumentation.openhands.internal import session_context + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=type("Msg", (), {"content": "x", "source": "user"})(), + sid="ephemeral-sid", + ) + + asyncio.run(_scenario()) + assert session_context.get_context("ephemeral-sid") is None + + +def test_io_attributes_on_entry_agent_step(instrumented_v0): + """Verify GenAI / OpenInference I/O attributes are populated.""" + inst, exporter = instrumented_v0 + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="io-sid") + runtime = rt_base.Runtime(sid="io-sid") + action = rt_base.Action(action_type="run", command="cat /etc/hosts") + + # Seed history with a *MessageAction*-named instance — that's the type + # name the AGENT wrapper looks for when computing input.messages. + class MessageAction: + content = "do the thing" + source = "user" + + ctrl.state.history = [MessageAction()] + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="io-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + entry = _spans_by_kind_attr(exporter, "ENTRY")[0] + agent = _spans_by_kind_attr(exporter, "AGENT")[0] + step = _spans_by_kind_attr(exporter, "STEP")[0] + tool = _spans_by_kind_attr(exporter, "TOOL")[0] + + # ENTRY + assert entry.attributes.get("gen_ai.framework") == "openhands" + assert entry.attributes.get("gen_ai.system") == "openhands" + assert entry.attributes.get("gen_ai.session.id") == "io-sid" + assert entry.attributes.get("input.value") + assert "do the thing" in entry.attributes.get("input.value") + + # AGENT + assert agent.attributes.get("gen_ai.input.messages") + assert "do the thing" in agent.attributes.get("gen_ai.input.messages") + assert agent.attributes.get("input.value") + assert agent.attributes.get("gen_ai.session.id") == "io-sid" + + # STEP + assert step.attributes.get("input.value") + assert step.attributes.get("output.value") + assert step.attributes.get("gen_ai.output.messages") + assert step.attributes.get("openhands.action.type") == "run" + out = step.attributes.get("output.value") + assert "tool_calls" in out and "echo step" in out + + # TOOL + assert tool.attributes.get("gen_ai.tool.name") == "bash" + assert tool.attributes.get("input.value") + assert "cat /etc/hosts" in tool.attributes.get("input.value") + assert tool.attributes.get("output.value") + assert "exit_code" in tool.attributes.get("output.value") diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py new file mode 100644 index 000000000..18dda9a55 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py @@ -0,0 +1,161 @@ +"""Tests for V0 (Legacy CodeAct) wrappers. + +We exercise the four V0 patches (``run_controller``, ``run_agent_until_done``, +``AgentController._step``, ``Runtime.run_action``) and assert that: + +* The ``ENTRY → AGENT → STEP → TOOL`` span tree is produced. +* Parent-child linkage is correct. +* Per-action ``gen_ai.tool.name`` is mapped from the V0 ``action`` field. +""" + +from __future__ import annotations + +import asyncio + +import pytest + + +def _spans_by_kind_attr(exporter, kind: str): + return [ + s + for s in exporter.get_finished_spans() + if s.attributes.get("gen_ai.span.kind") == kind + ] + + +@pytest.fixture +def instrumented_v0(tracer_provider, stub_openhands_v0_modules): + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + + inst = OpenHandsInstrumentor() + inst.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + try: + yield inst, tracer_provider._exporter # type: ignore[attr-defined] + finally: + try: + inst.uninstrument() + except Exception: + pass + + +def test_v0_full_span_tree(instrumented_v0): + inst, exporter = instrumented_v0 + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController() + runtime = rt_base.Runtime() + action = rt_base.Action(action_type="run", command="ls /") + + async def _inner(controller, _runtime): + for _ in range(2): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + + async def _scenario(): + # ENTRY span via run_controller wrapper + await main_mod.run_controller( + config=None, + initial_user_action=type("Msg", (), {"content": "hello"})(), + sid="sid-test", + ) + # AGENT span via run_agent_until_done wrapper (which calls _inner) + await loop_mod.run_agent_until_done(ctrl, runtime, None, []) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + + entry = _spans_by_kind_attr(exporter, "ENTRY") + agent = _spans_by_kind_attr(exporter, "AGENT") + step = _spans_by_kind_attr(exporter, "STEP") + tool = _spans_by_kind_attr(exporter, "TOOL") + + assert len(entry) == 1, f"unexpected ENTRY count: {len(entry)}" + assert len(agent) == 1, f"unexpected AGENT count: {len(agent)}" + assert len(step) == 2, f"unexpected STEP count: {len(step)}" + assert len(tool) == 2, f"unexpected TOOL count: {len(tool)}" + + e = entry[0] + a = agent[0] + assert e.name == "enter openhands" + assert e.attributes.get("gen_ai.framework") == "openhands" + assert e.attributes.get("gen_ai.session.id") == "sid-test" + + assert a.name.startswith("invoke_agent ") + assert a.attributes.get("gen_ai.agent.name") == "CodeActAgent" + assert a.attributes.get("gen_ai.request.model") == "qwen3-coder-plus" + + # All STEP spans share the AGENT as parent. + for s in step: + assert s.parent is not None + assert s.parent.span_id == a.context.span_id + assert s.attributes.get("gen_ai.operation.name") == "react" + assert s.attributes.get("gen_ai.react.round") in (1, 2) + + # TOOL spans are siblings of STEP under AGENT (run_action is called after + # _step returns and is no longer in STEP context). + for t in tool: + assert t.attributes.get("gen_ai.tool.name") == "bash" + assert t.attributes.get("openhands.action.type") == "run" + assert t.attributes.get("openhands.action.exit_code") == 0 + + +def test_v0_step_round_increments_per_controller(instrumented_v0): + inst, exporter = instrumented_v0 + import openhands.controller.agent_controller as ctrl_mod + + ctrl_a = ctrl_mod.AgentController(sid="A") + ctrl_b = ctrl_mod.AgentController(sid="B") + + async def _go(): + await ctrl_a._step() + await ctrl_a._step() + await ctrl_b._step() + + asyncio.run(_go()) + + step_spans = _spans_by_kind_attr(exporter, "STEP") + assert len(step_spans) == 3 + rounds_a = sorted( + s.attributes.get("gen_ai.react.round") + for s in step_spans + if s.attributes.get("gen_ai.session.id") == "A" + ) + rounds_b = sorted( + s.attributes.get("gen_ai.react.round") + for s in step_spans + if s.attributes.get("gen_ai.session.id") == "B" + ) + assert rounds_a == [1, 2] + assert rounds_b == [1] + + +def test_v0_runtime_error_observation_marks_span(instrumented_v0): + inst, exporter = instrumented_v0 + import openhands.runtime.base as rt_base + + runtime = rt_base.Runtime() + + class _ErrAction: + action = "run" + command = "false" + + # Use the conftest hook to make the next run_action return an error obs. + err_obs = rt_base.Observation(exit_code=2) + runtime._next_observation = err_obs + + runtime.run_action(_ErrAction()) + + tool_spans = _spans_by_kind_attr(exporter, "TOOL") + assert len(tool_spans) == 1 + span = tool_spans[0] + assert span.attributes.get("openhands.action.exit_code") == 2 + assert span.status.status_code.name == "ERROR" + diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md new file mode 100644 index 000000000..4d4f4d7b1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md @@ -0,0 +1,32 @@ +# LoongSuite slop-code-bench Instrumentation + +OpenTelemetry instrumentation for the [slop-code-bench](https://github.com/SprocketLab/slop-code-bench) benchmark orchestrator. + +## Span Tree + +``` +ENTRY "slop-code.enter" +└── CHAIN "workflow.{problem_name}" + ├── TASK "task.{checkpoint_name}" + │ └── AGENT "agent.{agent_type}" + │ ├── STEP "react.step.{N}" [MiniSWE only] + │ └── ... + ├── TASK "task.{checkpoint_name}" + │ └── AGENT "agent.{agent_type}" + └── ... +LLM "chat {model_name}" [Rubric Judge] +``` + +## Installation + +```bash +pip install loongsuite-instrumentation-slop-code +``` + +## Usage + +```python +from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + +SlopCodeInstrumentor().instrument() +``` diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml new file mode 100644 index 000000000..b443381c2 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml @@ -0,0 +1,61 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-slop-code" +dynamic = ["version"] +description = "LoongSuite slop-code-bench instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "Zhiyong Liu", email = "liuzhiyong.lzy@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.14.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "slop-code-bench >= 0.1", +] +test = [ + "pytest", + "pytest-asyncio", + "pytest-forked", + "opentelemetry-sdk", +] + +[project.entry-points.opentelemetry_instrumentor] +slop_code = "opentelemetry.instrumentation.slop_code:SlopCodeInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-slop-code" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/slop_code/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py new file mode 100644 index 000000000..973cd969e --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py @@ -0,0 +1,211 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +OpenTelemetry slop-code-bench Instrumentation + +Instruments the slop-code benchmark orchestrator lifecycle: +- ENTRY: run_agent (CLI entrypoint) +- CHAIN/workflow: run_agent_on_problem (per-problem) +- TASK: AgentRunner._run_checkpoint (per-checkpoint) +- AGENT: Agent.run_checkpoint (concrete agent invocation) +- STEP: MiniSWEAgent.agent_step (ReAct iteration) +- LLM: grade_file_async (Rubric Judge) +""" + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.slop_code.package import _instruments +from opentelemetry.instrumentation.slop_code.version import __version__ +from opentelemetry.instrumentation.slop_code.wrappers.agent import ( + _AgentRunCheckpointWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.entry import ( + _EntryWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.llm import ( + _RubricGradeWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.step import ( + _MiniSWEStepWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.task import ( + _TaskRunCheckpointWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.workflow import ( + _WorkflowWrapper, +) +from opentelemetry.instrumentation.utils import unwrap + +logger = logging.getLogger(__name__) + +__all__ = ["SlopCodeInstrumentor", "__version__"] + +_MODULE_ENTRY = "slop_code.entrypoints.commands.run_agent" +_MODULE_WORKER = "slop_code.entrypoints.problem_runner.worker" +# slop_code.entrypoints.problem_runner.driver re-imports +# `run_agent_on_problem` via `from .worker import run_agent_on_problem` +# at package-load time, capturing the original function reference. Because +# our wrap happens after that bind, we must additionally replace the local +# binding inside `driver` itself, otherwise the worker subprocess still +# calls the un-wrapped original and the CHAIN span never fires. +_MODULE_DRIVER = "slop_code.entrypoints.problem_runner.driver" +_MODULE_RUNNER = "slop_code.agent_runner.runner" +_MODULE_AGENT = "slop_code.agent_runner.agent" +_MODULE_MINISWE = "slop_code.agent_runner.agents.miniswe" +_MODULE_RUBRIC = "slop_code.metrics.rubric.router" + + +class SlopCodeInstrumentor(BaseInstrumentor): + """OpenTelemetry instrumentor for slop-code-bench framework.""" + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + tracer = trace_api.get_tracer( + __name__, + __version__, + tracer_provider=tracer_provider, + ) + + # 3.1 ENTRY span: run_agent + try: + wrap_function_wrapper( + module=_MODULE_ENTRY, + name="run_agent", + wrapper=_EntryWrapper(tracer), + ) + except Exception as e: + logger.warning(f"Could not wrap run_agent: {e}") + + # 3.2 CHAIN span: run_agent_on_problem + workflow_wrapper = _WorkflowWrapper(tracer) + try: + wrap_function_wrapper( + module=_MODULE_WORKER, + name="run_agent_on_problem", + wrapper=workflow_wrapper, + ) + except Exception as e: + logger.warning(f"Could not wrap run_agent_on_problem: {e}") + # Also wrap the re-bound name inside driver. driver.py imports + # run_agent_on_problem at module-load time via `from .worker import ...`, + # so the local name escapes our worker-module patch. The worker + # subprocess inherits this stale reference via fork(), and CHAIN + # spans never fire unless we patch the local re-bind too. + try: + wrap_function_wrapper( + module=_MODULE_DRIVER, + name="run_agent_on_problem", + wrapper=workflow_wrapper, + ) + except Exception as e: + logger.warning(f"Could not wrap driver.run_agent_on_problem: {e}") + + # 3.3 TASK span: AgentRunner._run_checkpoint + try: + wrap_function_wrapper( + module=_MODULE_RUNNER, + name="AgentRunner._run_checkpoint", + wrapper=_TaskRunCheckpointWrapper(tracer), + ) + except Exception as e: + logger.warning(f"Could not wrap AgentRunner._run_checkpoint: {e}") + + # 3.4 AGENT span: Agent.run_checkpoint + try: + wrap_function_wrapper( + module=_MODULE_AGENT, + name="Agent.run_checkpoint", + wrapper=_AgentRunCheckpointWrapper(tracer), + ) + except Exception as e: + logger.warning(f"Could not wrap Agent.run_checkpoint: {e}") + + # 3.5 STEP span: MiniSWEAgent.agent_step + try: + wrap_function_wrapper( + module=_MODULE_MINISWE, + name="MiniSWEAgent.agent_step", + wrapper=_MiniSWEStepWrapper(tracer), + ) + except Exception as e: + logger.debug(f"Could not wrap MiniSWEAgent.agent_step: {e}") + + # 3.6 LLM span: grade_file_async + try: + wrap_function_wrapper( + module=_MODULE_RUBRIC, + name="grade_file_async", + wrapper=_RubricGradeWrapper(tracer), + ) + except Exception as e: + logger.debug(f"Could not wrap grade_file_async: {e}") + + def _uninstrument(self, **kwargs: Any) -> None: + try: + import slop_code.entrypoints.commands.run_agent as mod_entry + + unwrap(mod_entry, "run_agent") + except Exception: + pass + + try: + import slop_code.entrypoints.problem_runner.worker as mod_worker + + unwrap(mod_worker, "run_agent_on_problem") + except Exception: + pass + + try: + import slop_code.entrypoints.problem_runner.driver as mod_driver + + unwrap(mod_driver, "run_agent_on_problem") + except Exception: + pass + + try: + import slop_code.agent_runner.runner as mod_runner + + unwrap(mod_runner.AgentRunner, "_run_checkpoint") + except Exception: + pass + + try: + import slop_code.agent_runner.agent as mod_agent + + unwrap(mod_agent.Agent, "run_checkpoint") + except Exception: + pass + + try: + import slop_code.agent_runner.agents.miniswe as mod_miniswe + + unwrap(mod_miniswe.MiniSWEAgent, "agent_step") + except Exception: + pass + + try: + import slop_code.metrics.rubric.router as mod_rubric + + unwrap(mod_rubric, "grade_file_async") + except Exception: + pass diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/package.py new file mode 100644 index 000000000..13b6fe785 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/package.py @@ -0,0 +1,17 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_instruments = ("slop-code-bench >= 0.1",) + +_supports_metrics = True diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py new file mode 100644 index 000000000..ee7fce73f --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py @@ -0,0 +1,51 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for slop-code instrumentation.""" + +from typing import Any, Optional + +from opentelemetry.trace import Span + +SYSTEM_NAME = "slop-code" +MAX_ATTR_LEN = 1024 + + +def safe_get(obj: Any, attr: str, default: Any = None) -> Any: + """Safely get an attribute from an object, returning default on failure.""" + try: + return getattr(obj, attr, default) + except Exception: + return default + + +def safe_get_nested(obj: Any, *attrs: str, default: Any = None) -> Any: + """Safely traverse nested attributes.""" + current = obj + for attr in attrs: + try: + current = getattr(current, attr) + if current is None: + return default + except (AttributeError, TypeError): + return default + return current + + +def set_optional_attr(span: Span, key: str, value: Optional[Any]) -> None: + """Set a span attribute only if value is not None.""" + if value is not None: + if isinstance(value, str) and len(value) > MAX_ATTR_LEN: + value = value[:MAX_ATTR_LEN] + span.set_attribute(key, value) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py new file mode 100644 index 000000000..7bee975f0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.5.0.dev" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py new file mode 100644 index 000000000..b0a6f4284 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py @@ -0,0 +1,13 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/agent.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/agent.py new file mode 100644 index 000000000..94cb2b88a --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/agent.py @@ -0,0 +1,91 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AGENT span wrapper for Agent.run_checkpoint.""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + safe_get_nested, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _AgentRunCheckpointWrapper: + """Wrapper for Agent.run_checkpoint to create AGENT span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + agent_name = type(instance).__name__ + problem_name = safe_get(instance, "problem_name", "unknown") + + span_name = f"agent.{agent_name}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "invoke_agent", + gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.AGENT.value, + "gen_ai.agent.name": agent_name, + "slop_code.problem.name": str(problem_name), + } + + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes=attrs, + ) as span: + try: + result = wrapped(*args, **kwargs) + + # Extract after-call attributes from result + if result is not None: + usage = safe_get(result, "usage") + if usage is not None: + net_tokens = safe_get(usage, "net_tokens") + if net_tokens is not None: + set_optional_attr( + span, + gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, + safe_get(net_tokens, "input"), + ) + set_optional_attr( + span, + gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, + safe_get(net_tokens, "output"), + ) + cost = safe_get(usage, "cost") + set_optional_attr(span, "slop_code.usage.cost", cost) + steps = safe_get(usage, "steps") + set_optional_attr(span, "slop_code.usage.steps", steps) + + elapsed = safe_get(result, "elapsed") + set_optional_attr(span, "slop_code.elapsed_seconds", elapsed) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + span.set_attribute("error.type", type(e).__name__) + raise diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/entry.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/entry.py new file mode 100644 index 000000000..d31e666f1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/entry.py @@ -0,0 +1,58 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ENTRY span wrapper for slop_code.entrypoints.commands.run_agent.run_agent.""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + safe_get_nested, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _EntryWrapper: + """Wrapper for run_agent to create ENTRY span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + span_name = "slop-code.enter" + + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes={ + gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter", + gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.ENTRY.value, + }, + ) as span: + try: + result = wrapped(*args, **kwargs) + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/llm.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/llm.py new file mode 100644 index 000000000..0aaba20b8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/llm.py @@ -0,0 +1,104 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LLM span wrapper for grade_file_async (Rubric Judge).""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _RubricGradeWrapper: + """Wrapper for grade_file_async to create LLM span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + async def __call__(self, wrapped, instance, args, kwargs): + # grade_file_async(prompt_prefix, criteria_text, file_name, model, provider, temperature, ...) + model = kwargs.get("model") or (args[3] if len(args) > 3 else "unknown") + provider = kwargs.get("provider") or (args[4] if len(args) > 4 else None) + temperature = kwargs.get("temperature") or (args[5] if len(args) > 5 else None) + + # Determine system name from provider + system_name = SYSTEM_NAME + if provider is not None: + provider_val = provider.value if hasattr(provider, "value") else str(provider) + system_name = provider_val.lower() + + span_name = f"chat {model}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "chat", + gen_ai_attributes.GEN_AI_SYSTEM: system_name, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.LLM.value, + gen_ai_attributes.GEN_AI_REQUEST_MODEL: str(model), + } + + if temperature is not None: + attrs[gen_ai_attributes.GEN_AI_REQUEST_TEMPERATURE] = float(temperature) + + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.CLIENT, + attributes=attrs, + ) as span: + try: + result = await wrapped(*args, **kwargs) + + # result is tuple[list[dict], dict[str, Any]] + if isinstance(result, tuple) and len(result) >= 2: + response_data = result[1] + if isinstance(response_data, dict): + _set_usage_from_response(span, response_data) + response_id = response_data.get("id") + set_optional_attr(span, "gen_ai.response.id", response_id) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise + + +def _set_usage_from_response(span, response_data: dict) -> None: + """Extract and set token usage attributes from response_data.""" + usage = response_data.get("usage") + if not isinstance(usage, dict): + return + + # OpenRouter format: prompt_tokens / completion_tokens + # Bedrock format (normalized): input_tokens / output_tokens + input_tokens = usage.get("prompt_tokens") or usage.get("input_tokens") + output_tokens = usage.get("completion_tokens") or usage.get("output_tokens") + + set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens) + set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens) + + # Cache tokens (OpenRouter specific) + cache_read = usage.get("cache_read_input_tokens") + set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read) + + cache_creation = usage.get("cache_creation_input_tokens") + set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, cache_creation) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py new file mode 100644 index 000000000..93219fe89 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py @@ -0,0 +1,110 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""STEP span wrapper for MiniSWEAgent.agent_step.""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + safe_get_nested, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _MiniSWEStepWrapper: + """Wrapper for MiniSWEAgent.agent_step to create STEP span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + # Determine current step number (1-based) + usage = safe_get(instance, "usage") + current_steps = safe_get(usage, "steps", 0) if usage else 0 + step_num = current_steps + 1 + + span_name = f"react.step.{step_num}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "react", + gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.STEP.value, + gen_ai_extended_attributes.GEN_AI_REACT_ROUND: step_num, + } + + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes=attrs, + ) as span: + try: + result = wrapped(*args, **kwargs) + + # Extract token usage from result if available + if isinstance(result, dict): + token_usage = result.get("token_usage") + if token_usage is not None: + set_optional_attr( + span, + gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, + safe_get(token_usage, "input"), + ) + set_optional_attr( + span, + gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, + safe_get(token_usage, "output"), + ) + set_optional_attr( + span, + gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, + safe_get(token_usage, "cache_read"), + ) + set_optional_attr( + span, + gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, + safe_get(token_usage, "cache_write"), + ) + step_cost = result.get("step_cost") + set_optional_attr(span, "slop_code.step.cost", step_cost) + elif result is not None: + # Result might be a tuple or object; try attribute access + token_usage = safe_get(result, "token_usage") + if token_usage is not None: + set_optional_attr( + span, + gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, + safe_get(token_usage, "input"), + ) + set_optional_attr( + span, + gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, + safe_get(token_usage, "output"), + ) + + span.set_status(Status(StatusCode.OK)) + span.set_attribute(gen_ai_extended_attributes.GEN_AI_REACT_FINISH_REASON, "stop") + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + span.set_attribute(gen_ai_extended_attributes.GEN_AI_REACT_FINISH_REASON, "error") + raise diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/task.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/task.py new file mode 100644 index 000000000..b0f60a4fc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/task.py @@ -0,0 +1,91 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TASK span wrapper for AgentRunner._run_checkpoint.""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + safe_get_nested, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _TaskRunCheckpointWrapper: + """Wrapper for AgentRunner._run_checkpoint to create TASK span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + # _run_checkpoint(self, checkpoint, checkpoint_save_dir, is_first_checkpoint) + checkpoint = args[0] if args else kwargs.get("checkpoint") + is_first_checkpoint = args[2] if len(args) > 2 else kwargs.get("is_first_checkpoint", False) + + checkpoint_name = safe_get(checkpoint, "name", "unknown") + checkpoint_order = safe_get(checkpoint, "order") + + span_name = f"task.{checkpoint_name}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "run_task", + gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "TASK", + "slop_code.checkpoint.name": str(checkpoint_name), + } + + if checkpoint_order is not None: + attrs["slop_code.checkpoint.order"] = checkpoint_order + attrs["slop_code.is_first_checkpoint"] = bool(is_first_checkpoint) + + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes=attrs, + ) as span: + try: + result = wrapped(*args, **kwargs) + + # Extract after-call attributes from summary + if result is not None: + had_error = safe_get(result, "had_error") + set_optional_attr(span, "slop_code.had_error", had_error) + + passed_policy = safe_get(result, "passed_policy") + set_optional_attr(span, "slop_code.passed_policy", passed_policy) + + # Token usage from agent + agent = safe_get(instance, "agent") + if agent is not None: + net_tokens = safe_get_nested(agent, "usage", "net_tokens") + if net_tokens is not None: + input_tokens = safe_get(net_tokens, "input") + output_tokens = safe_get(net_tokens, "output") + set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens) + set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/workflow.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/workflow.py new file mode 100644 index 000000000..4793d4286 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/workflow.py @@ -0,0 +1,120 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CHAIN/workflow span wrapper for run_agent_on_problem.""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + safe_get_nested, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _WorkflowWrapper: + """Wrapper for run_agent_on_problem to create workflow (CHAIN) span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + # run_agent_on_problem(problem_config, problem_name, config, progress_queue, output_path) + problem_name = args[1] if len(args) > 1 else kwargs.get("problem_name", "unknown") + config = args[2] if len(args) > 2 else kwargs.get("config") + + span_name = f"workflow.{problem_name}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "workflow", + gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "CHAIN", + "slop_code.problem.name": str(problem_name), + } + + # Extract optional attributes from config + if config is not None: + model_name = safe_get_nested(config, "model_def", "name") + set_optional_attr_dict(attrs, gen_ai_attributes.GEN_AI_REQUEST_MODEL, model_name) + + agent_type = safe_get_nested(config, "agent_config", "type") + set_optional_attr_dict(attrs, "slop_code.agent.type", agent_type) + + pass_policy = safe_get_nested(config, "pass_policy", "value") + if pass_policy is None: + pass_policy_obj = safe_get(config, "pass_policy") + if pass_policy_obj is not None and hasattr(pass_policy_obj, "value"): + pass_policy = pass_policy_obj.value + set_optional_attr_dict(attrs, "slop_code.pass_policy", pass_policy) + + try: + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes={k: v for k, v in attrs.items() if v is not None}, + ) as span: + try: + result = wrapped(*args, **kwargs) + + if isinstance(result, dict): + summary = result.get("summary") + if isinstance(summary, dict): + set_optional_attr( + span, "slop_code.state", summary.get("state") + ) + set_optional_attr( + span, + "slop_code.passed_policy", + summary.get("passed_policy"), + ) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise + finally: + # Flush AFTER the `with` block so the workflow span itself + # is `on_end`-delivered to the SpanProcessor before we ask it + # to drain. run_agent_on_problem is the last meaningful work + # item inside the per-problem worker subprocess; once it + # returns, the process is reaped by ProcessPoolExecutor's + # shutdown which can short-circuit BatchSpanProcessor's + # atexit handler. Without this explicit flush the CHAIN span + # (and the tail batch of TASK/AGENT/STEP spans) gets dropped. + try: + provider = trace_api.get_tracer_provider() + flush = getattr(provider, "force_flush", None) + if callable(flush): + flush(timeout_millis=5000) + except Exception as flush_err: # noqa: BLE001 + logger.debug( + "force_flush after workflow span failed: %s", flush_err + ) + + +def set_optional_attr_dict(attrs: dict, key: str, value) -> None: + """Add to attrs dict only if value is not None.""" + if value is not None: + if isinstance(value, str) and len(value) > 1024: + value = value[:1024] + attrs[key] = value diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/test-requirements.txt new file mode 100644 index 000000000..9facd6bc9 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/test-requirements.txt @@ -0,0 +1,8 @@ +pytest +pytest-asyncio +pytest-forked==1.6.0 +opentelemetry-api +opentelemetry-sdk +opentelemetry-instrumentation +opentelemetry-semantic-conventions +wrapt diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/conftest.py new file mode 100644 index 000000000..dcda695d0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/conftest.py @@ -0,0 +1,209 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test configuration for slop-code instrumentation tests.""" + +import os +import sys +import types +from unittest.mock import MagicMock + +import pytest + +os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental" + + +def _make_module(name): + """Create a real module object.""" + mod = types.ModuleType(name) + mod.__package__ = name.rsplit(".", 1)[0] if "." in name else name + return mod + + +def _create_mock_slop_code_modules(): + """Create mock modules for slop_code so instrumentation can wrap them.""" + # Create all parent modules + mod_slop_code = _make_module("slop_code") + mod_entrypoints = _make_module("slop_code.entrypoints") + mod_commands = _make_module("slop_code.entrypoints.commands") + mod_run_agent = _make_module("slop_code.entrypoints.commands.run_agent") + mod_problem_runner = _make_module("slop_code.entrypoints.problem_runner") + mod_worker = _make_module("slop_code.entrypoints.problem_runner.worker") + mod_driver = _make_module("slop_code.entrypoints.problem_runner.driver") + mod_agent_runner = _make_module("slop_code.agent_runner") + mod_runner = _make_module("slop_code.agent_runner.runner") + mod_agent = _make_module("slop_code.agent_runner.agent") + mod_agents = _make_module("slop_code.agent_runner.agents") + mod_miniswe = _make_module("slop_code.agent_runner.agents.miniswe") + mod_metrics = _make_module("slop_code.metrics") + mod_rubric = _make_module("slop_code.metrics.rubric") + mod_router = _make_module("slop_code.metrics.rubric.router") + + # --- ENTRY: run_agent --- + def run_agent(*args, **kwargs): + return {"status": "completed"} + + mod_run_agent.run_agent = run_agent + + # --- WORKFLOW: run_agent_on_problem --- + def run_agent_on_problem(*args, **kwargs): + return {"summary": {"state": "completed", "passed_policy": True}} + + mod_worker.run_agent_on_problem = run_agent_on_problem + # driver re-imports the worker name at module load time. This mock mirrors + # the same pattern so the instrumentor's driver-side patch has a target. + mod_driver.run_agent_on_problem = run_agent_on_problem + + # --- TASK: AgentRunner._run_checkpoint --- + class AgentRunner: + def __init__(self): + self.agent = MagicMock() + self.agent.usage = MagicMock() + self.agent.usage.net_tokens = MagicMock() + self.agent.usage.net_tokens.input = 100 + self.agent.usage.net_tokens.output = 50 + + def _run_checkpoint(self, checkpoint, checkpoint_save_dir, is_first_checkpoint=False): + result = MagicMock() + result.had_error = False + result.passed_policy = True + return result + + mod_runner.AgentRunner = AgentRunner + + # --- AGENT: Agent.run_checkpoint --- + class Agent: + def __init__(self, problem_name="test_problem"): + self.problem_name = problem_name + self.usage = MagicMock() + self.usage.net_tokens = MagicMock() + self.usage.net_tokens.input = 100 + self.usage.net_tokens.output = 50 + self.usage.steps = 0 + self.usage.cost = 0.05 + + def run_checkpoint(self, task): + result = MagicMock() + result.usage = self.usage + result.elapsed = 10.5 + return result + + mod_agent.Agent = Agent + + # --- STEP: MiniSWEAgent.agent_step --- + class MiniSWEAgent(Agent): + def __init__(self, problem_name="test_problem"): + super().__init__(problem_name) + + def agent_step(self): + return { + "token_usage": MagicMock(input=200, output=80, cache_read=50, cache_write=10), + "step_cost": 0.01, + } + + mod_miniswe.MiniSWEAgent = MiniSWEAgent + + # --- LLM: grade_file_async --- + async def grade_file_async(*args, **kwargs): + grades = [{"score": 8, "reasoning": "Good code"}] + response_data = { + "id": "resp-123", + "usage": { + "prompt_tokens": 500, + "completion_tokens": 200, + "cache_read_input_tokens": 100, + "cache_creation_input_tokens": 50, + }, + } + return grades, response_data + + mod_router.grade_file_async = grade_file_async + + # Wire parent-child relationships + mod_slop_code.entrypoints = mod_entrypoints + mod_slop_code.agent_runner = mod_agent_runner + mod_slop_code.metrics = mod_metrics + mod_entrypoints.commands = mod_commands + mod_entrypoints.problem_runner = mod_problem_runner + mod_commands.run_agent = mod_run_agent + mod_problem_runner.worker = mod_worker + mod_problem_runner.driver = mod_driver + mod_agent_runner.runner = mod_runner + mod_agent_runner.agent = mod_agent + mod_agent_runner.agents = mod_agents + mod_agents.miniswe = mod_miniswe + mod_metrics.rubric = mod_rubric + mod_rubric.router = mod_router + + # Register all modules in sys.modules + modules = { + "slop_code": mod_slop_code, + "slop_code.entrypoints": mod_entrypoints, + "slop_code.entrypoints.commands": mod_commands, + "slop_code.entrypoints.commands.run_agent": mod_run_agent, + "slop_code.entrypoints.problem_runner": mod_problem_runner, + "slop_code.entrypoints.problem_runner.worker": mod_worker, + "slop_code.entrypoints.problem_runner.driver": mod_driver, + "slop_code.agent_runner": mod_agent_runner, + "slop_code.agent_runner.runner": mod_runner, + "slop_code.agent_runner.agent": mod_agent, + "slop_code.agent_runner.agents": mod_agents, + "slop_code.agent_runner.agents.miniswe": mod_miniswe, + "slop_code.metrics": mod_metrics, + "slop_code.metrics.rubric": mod_rubric, + "slop_code.metrics.rubric.router": mod_router, + } + + for name, mod in modules.items(): + sys.modules[name] = mod + + return modules + + +# Install mock modules before any instrumentation imports +_mock_modules = _create_mock_slop_code_modules() + + +@pytest.fixture(scope="function") +def span_exporter(): + from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, + ) + + exporter = InMemorySpanExporter() + yield exporter + exporter.clear() + + +@pytest.fixture(scope="function") +def tracer_provider(span_exporter): + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor + + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function") +def instrument(tracer_provider): + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py new file mode 100644 index 000000000..d372ba220 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py @@ -0,0 +1,102 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for AGENT span (Agent.run_checkpoint).""" + +from unittest.mock import MagicMock + +import pytest + +from opentelemetry.trace import StatusCode + + +class TestAgentSpan: + """Verify that Agent.run_checkpoint produces an AGENT span.""" + + def test_agent_span_created(self, span_exporter, instrument): + """Agent.run_checkpoint should create an AGENT span.""" + import slop_code.agent_runner.agent as mod + + agent = mod.Agent(problem_name="file_backup") + result = agent.run_checkpoint("solve the bug") + + spans = span_exporter.get_finished_spans() + agent_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "invoke_agent" + ] + assert len(agent_spans) == 1 + + span = agent_spans[0] + assert span.name == "agent.Agent" + assert span.attributes["gen_ai.system"] == "slop-code" + assert span.attributes["gen_ai.span.kind"] == "AGENT" + assert span.attributes["gen_ai.agent.name"] == "Agent" + assert span.attributes["slop_code.problem.name"] == "file_backup" + assert span.status.status_code == StatusCode.OK + + def test_agent_span_captures_usage(self, span_exporter, instrument): + """AGENT span should capture token usage from result.""" + import slop_code.agent_runner.agent as mod + + agent = mod.Agent(problem_name="test_prob") + agent.run_checkpoint("task") + + spans = span_exporter.get_finished_spans() + agent_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "invoke_agent" + ] + assert len(agent_spans) == 1 + span = agent_spans[0] + + assert "gen_ai.usage.input_tokens" in span.attributes + assert "gen_ai.usage.output_tokens" in span.attributes + assert span.attributes["gen_ai.usage.input_tokens"] == 100 + assert span.attributes["gen_ai.usage.output_tokens"] == 50 + + def test_agent_span_error(self, span_exporter, tracer_provider): + """Exception in Agent.run_checkpoint should produce error span.""" + import slop_code.agent_runner.agent as mod + + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + class FailingAgent(mod.Agent): + def run_checkpoint(self, task): + raise TimeoutError("Agent timeout") + + OriginalAgent = mod.Agent + mod.Agent = FailingAgent + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + + try: + agent = mod.Agent(problem_name="test_prob") + + with pytest.raises(TimeoutError, match="Agent timeout"): + agent.run_checkpoint("task") + + spans = span_exporter.get_finished_spans() + agent_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "invoke_agent" + ] + assert len(agent_spans) == 1 + span = agent_spans[0] + assert span.status.status_code == StatusCode.ERROR + assert span.attributes.get("error.type") == "TimeoutError" + finally: + instrumentor.uninstrument() + mod.Agent = OriginalAgent diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_entry_span.py new file mode 100644 index 000000000..2f7c1751f --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_entry_span.py @@ -0,0 +1,74 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ENTRY span (run_agent).""" + +import pytest + +from opentelemetry.trace import StatusCode + + +class TestEntrySpan: + """Verify that run_agent produces an ENTRY span.""" + + def test_entry_span_created(self, span_exporter, instrument): + """run_agent should create an ENTRY span with correct attributes.""" + import slop_code.entrypoints.commands.run_agent as mod + + mod.run_agent() + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "ENTRY" + ] + assert len(entry_spans) == 1 + + span = entry_spans[0] + assert span.name == "slop-code.enter" + assert span.attributes["gen_ai.system"] == "slop-code" + assert span.attributes["gen_ai.operation.name"] == "enter" + assert span.status.status_code == StatusCode.OK + + def test_entry_span_error(self, span_exporter, tracer_provider): + """run_agent raising an exception should produce an error ENTRY span.""" + import slop_code.entrypoints.commands.run_agent as mod + + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + # Store original and replace with failing function + original = mod.run_agent + + def failing_run_agent(*args, **kwargs): + raise RuntimeError("Config error") + + mod.run_agent = failing_run_agent + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + + try: + with pytest.raises(RuntimeError, match="Config error"): + mod.run_agent() + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "ENTRY" + ] + assert len(entry_spans) == 1 + assert entry_spans[0].status.status_code == StatusCode.ERROR + finally: + instrumentor.uninstrument() + mod.run_agent = original diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_hierarchy.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_hierarchy.py new file mode 100644 index 000000000..d33cc3568 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_hierarchy.py @@ -0,0 +1,118 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for span hierarchy and parent-child relationships.""" + +from unittest.mock import MagicMock + +import pytest + +from opentelemetry.trace import StatusCode + + +class TestSpanHierarchy: + """Verify parent-child relationships between spans.""" + + def test_entry_is_parent_of_workflow(self, span_exporter, instrument): + """ENTRY span should be parent of workflow span when called inline.""" + import slop_code.entrypoints.commands.run_agent as entry_mod + import slop_code.entrypoints.problem_runner.worker as worker_mod + + # Patch run_agent to call run_agent_on_problem internally + original = entry_mod.run_agent.__wrapped__ + + def run_with_workflow(*args, **kwargs): + config = MagicMock() + config.model_def = None + config.agent_config = None + config.pass_policy = None + return worker_mod.run_agent_on_problem( + MagicMock(), "test_problem", config, MagicMock(), "/tmp" + ) + + entry_mod.run_agent.__wrapped__ = run_with_workflow + + try: + entry_mod.run_agent() + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "ENTRY" + ] + workflow_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "workflow" + ] + + assert len(entry_spans) == 1 + assert len(workflow_spans) == 1 + + entry_span = entry_spans[0] + workflow_span = workflow_spans[0] + + # workflow should be child of entry + assert workflow_span.context.trace_id == entry_span.context.trace_id + assert workflow_span.parent is not None + assert workflow_span.parent.span_id == entry_span.context.span_id + finally: + entry_mod.run_agent.__wrapped__ = original + + def test_workflow_is_parent_of_task(self, span_exporter, instrument): + """Workflow span should be parent of task span when called inline.""" + import slop_code.agent_runner.runner as runner_mod + import slop_code.entrypoints.problem_runner.worker as worker_mod + + original = worker_mod.run_agent_on_problem.__wrapped__ + + def workflow_with_task(*args, **kwargs): + r = runner_mod.AgentRunner() + checkpoint = MagicMock() + checkpoint.name = "cp1" + checkpoint.order = 1 + r._run_checkpoint(checkpoint, "/tmp", True) + return {"summary": {"state": "completed", "passed_policy": True}} + + worker_mod.run_agent_on_problem.__wrapped__ = workflow_with_task + + try: + config = MagicMock() + config.model_def = None + config.agent_config = None + config.pass_policy = None + worker_mod.run_agent_on_problem( + MagicMock(), "prob1", config, MagicMock(), "/tmp" + ) + + spans = span_exporter.get_finished_spans() + workflow_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "workflow" + ] + task_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "run_task" + ] + + assert len(workflow_spans) == 1 + assert len(task_spans) == 1 + + workflow_span = workflow_spans[0] + task_span = task_spans[0] + + assert task_span.context.trace_id == workflow_span.context.trace_id + assert task_span.parent is not None + assert task_span.parent.span_id == workflow_span.context.span_id + finally: + worker_mod.run_agent_on_problem.__wrapped__ = original diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_llm_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_llm_span.py new file mode 100644 index 000000000..c88e46430 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_llm_span.py @@ -0,0 +1,142 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for LLM span (grade_file_async - Rubric Judge).""" + +from unittest.mock import MagicMock + +import pytest + +from opentelemetry.trace import SpanKind, StatusCode + + +@pytest.mark.asyncio +class TestLLMSpan: + """Verify that grade_file_async produces an LLM span.""" + + async def test_llm_span_created(self, span_exporter, instrument): + """grade_file_async should create an LLM span.""" + import slop_code.metrics.rubric.router as mod + + provider = MagicMock() + provider.value = "openrouter" + + grades, resp = await mod.grade_file_async( + "prompt_prefix", + "criteria_text", + "test.py", + "anthropic/claude-3.5-sonnet", + provider, + 0.7, + ) + + spans = span_exporter.get_finished_spans() + llm_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "LLM" + ] + assert len(llm_spans) == 1 + + span = llm_spans[0] + assert span.name == "chat anthropic/claude-3.5-sonnet" + assert span.attributes["gen_ai.system"] == "openrouter" + assert span.attributes["gen_ai.operation.name"] == "chat" + assert span.attributes["gen_ai.request.model"] == "anthropic/claude-3.5-sonnet" + assert span.attributes["gen_ai.request.temperature"] == 0.7 + assert span.kind == SpanKind.CLIENT + assert span.status.status_code == StatusCode.OK + + async def test_llm_span_captures_usage(self, span_exporter, instrument): + """LLM span should capture token usage from response.""" + import slop_code.metrics.rubric.router as mod + + provider = MagicMock() + provider.value = "openrouter" + + await mod.grade_file_async( + "prefix", "criteria", "file.py", + "anthropic/claude-3.5-sonnet", provider, 0.5, + ) + + spans = span_exporter.get_finished_spans() + llm_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "LLM" + ] + assert len(llm_spans) == 1 + span = llm_spans[0] + + assert span.attributes["gen_ai.usage.input_tokens"] == 500 + assert span.attributes["gen_ai.usage.output_tokens"] == 200 + assert span.attributes["gen_ai.usage.cache_read.input_tokens"] == 100 + assert span.attributes["gen_ai.usage.cache_creation.input_tokens"] == 50 + assert span.attributes["gen_ai.response.id"] == "resp-123" + + async def test_llm_span_error(self, span_exporter, tracer_provider): + """Exception in grade_file_async should produce an error LLM span.""" + import slop_code.metrics.rubric.router as mod + + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + original = mod.grade_file_async + + async def failing_grade(*args, **kwargs): + raise ConnectionError("API unreachable") + + mod.grade_file_async = failing_grade + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + + provider = MagicMock() + provider.value = "bedrock" + + try: + with pytest.raises(ConnectionError, match="API unreachable"): + await mod.grade_file_async( + "prefix", "criteria", "file.py", + "us.anthropic.claude-3-5-sonnet-20241022-v2:0", provider, 0.3, + ) + + spans = span_exporter.get_finished_spans() + llm_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "LLM" + ] + assert len(llm_spans) == 1 + assert llm_spans[0].status.status_code == StatusCode.ERROR + assert llm_spans[0].attributes["gen_ai.system"] == "bedrock" + finally: + instrumentor.uninstrument() + mod.grade_file_async = original + + async def test_llm_span_bedrock_provider(self, span_exporter, instrument): + """LLM span with bedrock provider should use 'bedrock' as system.""" + import slop_code.metrics.rubric.router as mod + + provider = MagicMock() + provider.value = "bedrock" + + await mod.grade_file_async( + "prefix", "criteria", "file.py", + "us.anthropic.claude-3-5-sonnet-20241022-v2:0", provider, 0.5, + ) + + spans = span_exporter.get_finished_spans() + llm_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "LLM" + ] + assert len(llm_spans) == 1 + assert llm_spans[0].attributes["gen_ai.system"] == "bedrock" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_step_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_step_span.py new file mode 100644 index 000000000..70e221da2 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_step_span.py @@ -0,0 +1,133 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for STEP span (MiniSWEAgent.agent_step).""" + +from unittest.mock import MagicMock + +import pytest + +from opentelemetry.trace import StatusCode + + +class TestStepSpan: + """Verify that MiniSWEAgent.agent_step produces a STEP span.""" + + def test_step_span_created(self, span_exporter, instrument): + """agent_step should create a STEP span with token attributes.""" + import slop_code.agent_runner.agents.miniswe as mod + + agent = mod.MiniSWEAgent(problem_name="test_prob") + result = agent.agent_step() + + spans = span_exporter.get_finished_spans() + step_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "STEP" + ] + assert len(step_spans) == 1 + + span = step_spans[0] + assert span.name == "react.step.1" + assert span.attributes["gen_ai.system"] == "slop-code" + assert span.attributes["gen_ai.operation.name"] == "react" + assert span.attributes["gen_ai.react.round"] == 1 + assert span.status.status_code == StatusCode.OK + + def test_step_span_has_token_usage(self, span_exporter, instrument): + """STEP span should capture token usage from result.""" + import slop_code.agent_runner.agents.miniswe as mod + + agent = mod.MiniSWEAgent(problem_name="test_prob") + agent.agent_step() + + spans = span_exporter.get_finished_spans() + step_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "STEP" + ] + assert len(step_spans) == 1 + span = step_spans[0] + + assert span.attributes["gen_ai.usage.input_tokens"] == 200 + assert span.attributes["gen_ai.usage.output_tokens"] == 80 + assert span.attributes["gen_ai.usage.cache_read.input_tokens"] == 50 + assert span.attributes["gen_ai.usage.cache_creation.input_tokens"] == 10 + + def test_step_span_increments_round(self, span_exporter, instrument): + """Multiple agent_step calls should increment the round number.""" + import slop_code.agent_runner.agents.miniswe as mod + + agent = mod.MiniSWEAgent(problem_name="test_prob") + # Simulate steps=2 already completed + agent.usage.steps = 2 + agent.agent_step() + + spans = span_exporter.get_finished_spans() + step_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "STEP" + ] + assert len(step_spans) == 1 + assert step_spans[0].name == "react.step.3" + assert step_spans[0].attributes["gen_ai.react.round"] == 3 + + def test_step_span_error(self, span_exporter, tracer_provider): + """Exception in agent_step should produce an error STEP span.""" + import slop_code.agent_runner.agents.miniswe as mod + + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + class FailingMiniSWE(mod.MiniSWEAgent): + def agent_step(self): + raise RuntimeError("LimitsExceeded") + + OriginalClass = mod.MiniSWEAgent + mod.MiniSWEAgent = FailingMiniSWE + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + + try: + agent = mod.MiniSWEAgent(problem_name="test_prob") + + with pytest.raises(RuntimeError, match="LimitsExceeded"): + agent.agent_step() + + spans = span_exporter.get_finished_spans() + step_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "STEP" + ] + assert len(step_spans) == 1 + span = step_spans[0] + assert span.status.status_code == StatusCode.ERROR + assert span.attributes["gen_ai.react.finish_reason"] == "error" + finally: + instrumentor.uninstrument() + mod.MiniSWEAgent = OriginalClass + + def test_step_span_finish_reason_stop(self, span_exporter, instrument): + """Successful step should have finish_reason='stop'.""" + import slop_code.agent_runner.agents.miniswe as mod + + agent = mod.MiniSWEAgent(problem_name="test_prob") + agent.agent_step() + + spans = span_exporter.get_finished_spans() + step_spans = [ + s for s in spans + if s.attributes.get("gen_ai.span.kind") == "STEP" + ] + assert step_spans[0].attributes["gen_ai.react.finish_reason"] == "stop" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_task_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_task_span.py new file mode 100644 index 000000000..de3e16a95 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_task_span.py @@ -0,0 +1,110 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for TASK span (AgentRunner._run_checkpoint).""" + +from unittest.mock import MagicMock + +import pytest + +from opentelemetry.trace import StatusCode + + +class TestTaskSpan: + """Verify that AgentRunner._run_checkpoint produces a TASK span.""" + + def test_task_span_created(self, span_exporter, instrument): + """_run_checkpoint should create a task span.""" + import slop_code.agent_runner.runner as mod + + runner = mod.AgentRunner() + + checkpoint = MagicMock() + checkpoint.name = "checkpoint_1" + checkpoint.order = 1 + + result = runner._run_checkpoint(checkpoint, "/tmp/save", True) + + spans = span_exporter.get_finished_spans() + task_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "run_task" + ] + assert len(task_spans) == 1 + + span = task_spans[0] + assert span.name == "task.checkpoint_1" + assert span.attributes["gen_ai.system"] == "slop-code" + assert span.attributes["gen_ai.span.kind"] == "TASK" + assert span.attributes["slop_code.checkpoint.name"] == "checkpoint_1" + assert span.attributes["slop_code.checkpoint.order"] == 1 + assert span.attributes["slop_code.is_first_checkpoint"] is True + assert span.status.status_code == StatusCode.OK + + def test_task_span_error(self, span_exporter, tracer_provider): + """Exception in _run_checkpoint should produce an error task span.""" + import slop_code.agent_runner.runner as mod + + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + class FailingRunner(mod.AgentRunner): + def _run_checkpoint(self, checkpoint, checkpoint_save_dir, is_first_checkpoint=False): + raise RuntimeError("Checkpoint failed") + + # Replace class temporarily + OriginalRunner = mod.AgentRunner + mod.AgentRunner = FailingRunner + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + + try: + runner = mod.AgentRunner() + checkpoint = MagicMock() + checkpoint.name = "bad_checkpoint" + checkpoint.order = 2 + + with pytest.raises(RuntimeError, match="Checkpoint failed"): + runner._run_checkpoint(checkpoint, "/tmp/save", False) + + spans = span_exporter.get_finished_spans() + task_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "run_task" + ] + assert len(task_spans) == 1 + assert task_spans[0].status.status_code == StatusCode.ERROR + finally: + instrumentor.uninstrument() + mod.AgentRunner = OriginalRunner + + def test_task_span_not_first_checkpoint(self, span_exporter, instrument): + """Subsequent checkpoint should have is_first_checkpoint=False.""" + import slop_code.agent_runner.runner as mod + + runner = mod.AgentRunner() + + checkpoint = MagicMock() + checkpoint.name = "checkpoint_2" + checkpoint.order = 2 + + runner._run_checkpoint(checkpoint, "/tmp/save", False) + + spans = span_exporter.get_finished_spans() + task_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "run_task" + ] + assert len(task_spans) == 1 + assert task_spans[0].attributes["slop_code.is_first_checkpoint"] is False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_workflow_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_workflow_span.py new file mode 100644 index 000000000..6d0a79ddc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_workflow_span.py @@ -0,0 +1,117 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for CHAIN/workflow span (run_agent_on_problem).""" + +from unittest.mock import MagicMock + +import pytest + +from opentelemetry.trace import StatusCode + + +class TestWorkflowSpan: + """Verify that run_agent_on_problem produces a workflow span.""" + + def test_workflow_span_created(self, span_exporter, instrument): + """run_agent_on_problem should create a workflow span.""" + import slop_code.entrypoints.problem_runner.worker as mod + + config = MagicMock() + config.model_def = MagicMock() + config.model_def.name = "anthropic/claude-3.5-sonnet" + config.agent_config = MagicMock() + config.agent_config.type = "claude_code" + config.pass_policy = MagicMock() + config.pass_policy.value = "any" + + result = mod.run_agent_on_problem( + MagicMock(), # problem_config + "file_backup", # problem_name + config, # config + MagicMock(), # progress_queue + "/tmp/output", # output_path + ) + + spans = span_exporter.get_finished_spans() + workflow_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "workflow" + ] + assert len(workflow_spans) == 1 + + span = workflow_spans[0] + assert span.name == "workflow.file_backup" + assert span.attributes["gen_ai.system"] == "slop-code" + assert span.attributes["gen_ai.span.kind"] == "CHAIN" + assert span.attributes["slop_code.problem.name"] == "file_backup" + assert span.attributes["gen_ai.request.model"] == "anthropic/claude-3.5-sonnet" + assert span.attributes["slop_code.agent.type"] == "claude_code" + assert span.status.status_code == StatusCode.OK + + def test_workflow_span_error(self, span_exporter, tracer_provider): + """Exception in run_agent_on_problem should produce error workflow span.""" + import slop_code.entrypoints.problem_runner.worker as mod + + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + original = mod.run_agent_on_problem + + def failing_worker(*args, **kwargs): + raise ValueError("Problem not found") + + mod.run_agent_on_problem = failing_worker + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True) + + try: + with pytest.raises(ValueError, match="Problem not found"): + mod.run_agent_on_problem( + MagicMock(), "missing_problem", MagicMock(), MagicMock(), "/tmp" + ) + + spans = span_exporter.get_finished_spans() + workflow_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "workflow" + ] + assert len(workflow_spans) == 1 + assert workflow_spans[0].status.status_code == StatusCode.ERROR + finally: + instrumentor.uninstrument() + mod.run_agent_on_problem = original + + def test_workflow_span_with_none_config_fields(self, span_exporter, instrument): + """Workflow span should handle None config fields gracefully.""" + import slop_code.entrypoints.problem_runner.worker as mod + + config = MagicMock() + config.model_def = None + config.agent_config = None + config.pass_policy = None + + mod.run_agent_on_problem( + MagicMock(), "test_problem", config, MagicMock(), "/tmp" + ) + + spans = span_exporter.get_finished_spans() + workflow_spans = [ + s for s in spans + if s.attributes.get("gen_ai.operation.name") == "workflow" + ] + assert len(workflow_spans) == 1 + span = workflow_spans[0] + assert span.attributes["slop_code.problem.name"] == "test_problem" + assert "gen_ai.request.model" not in span.attributes diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/LICENSE b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml new file mode 100644 index 000000000..62aaa6e5a --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-terminus2" +dynamic = ["version"] +description = "LoongSuite Terminus2 Instrumentation" +license = "Apache-2.0" +requires-python = ">=3.8" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + "terminal-bench >= 0.1.0", +] + +[project.entry-points.opentelemetry_instrumentor] +terminus2 = "opentelemetry.instrumentation.terminus2:Terminus2Instrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-terminus2" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/terminus2/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py new file mode 100644 index 000000000..026ba3c12 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py @@ -0,0 +1,802 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +OpenTelemetry Terminus2 Instrumentation + +Provides automatic instrumentation for the terminus-2 agent from terminal-bench +via external monkey patching (no upstream changes required). + +Span hierarchy & semantic mapping (strictly follows ARMS gen-ai semantic +conventions, see ``arms_docs/trace/gen-ai.md``): + + enter_ai_application_system (ENTRY / enter) + └── invoke_agent terminus-2 (AGENT / invoke_agent) + └── react step (STEP / react) ── episode N + ├── (LLM span produced by ``opentelemetry-instrumentation-litellm``) + ├── run_task parse_response (TASK / run_task) + ├── chain summarize (CHAIN / task) ── on overflow + └── execute_tool terminal (TOOL / execute_tool) + +LLM spans are intentionally **not** produced by this package. The underlying +``LiteLLM.call`` invokes ``litellm.completion`` which is already traced by +``opentelemetry-instrumentation-litellm``; emitting another span here would +duplicate that record. + +Patch targets (all monkey-patched via ``wrapt.wrap_function_wrapper``): + + P0 Terminus2.perform_task → ENTRY span (application entry) + P0 Terminus2._run_agent_loop → AGENT span + episode lifecycle + P0 Terminus2._execute_commands → TOOL span + P1 Terminus2._handle_llm_interaction → STEP span (per ReAct iteration) + P1 TerminusJSONPlainParser.parse_response / + TerminusXMLPlainParser.parse_response → TASK span + P2 Terminus2._summarize → CHAIN span (handoff) +""" + +import contextvars +import json +import logging +from typing import Any, Collection + +from opentelemetry import context as context_api +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.trace import SpanKind, Status, StatusCode +from wrapt import wrap_function_wrapper + +from aliyun.semconv.trace_v2 import ( + CommonAttributes, + GenAiOperationName, + GenAiSpanKind, + GenAiToolType, + LLMAttributes, + ToolAttributes, +) + +from aliyun.sdk.extension.arms.self_monitor.self_monitor_decorator import hook_advice + +from opentelemetry.instrumentation.terminus2.package import _instruments + +logger = logging.getLogger(__name__) + +# ── Framework / agent identifiers ──────────────────────────────────────────── +_FRAMEWORK = "terminal-bench" +_AGENT_NAME = "terminus-2" +_TERMINAL_TOOL_NAME = "terminal" +_TERMINAL_TOOL_DESCRIPTION = "Send keystrokes to a tmux terminal session" + +# Spec-defined tool I/O attribute keys (not yet exposed as constants in +# aliyun.semconv.trace_v2.ToolAttributes; see gen-ai.md §Tool). +_GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments" +_GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result" + +# ── Span kind / operation values not present in trace_v2 enums ─────────────── +_SPAN_KIND_ENTRY = "ENTRY" +_SPAN_KIND_STEP = "STEP" +_OP_ENTER = "enter" +_OP_REACT = "react" +_OP_RUN_TASK = "run_task" +_OP_TASK = "task" + +# ── ReAct extension attributes (阿里云扩展规范) ────────────────────────────── +_GEN_AI_REACT_ROUND = "gen_ai.react.round" +_GEN_AI_REACT_FINISH_REASON = "gen_ai.react.finish_reason" + +# ── Content capture ───────────────────────────────────────────────────────── +# Inputs / outputs (instruction text, terminal keystrokes, terminal output, +# AgentResult summary) are captured **unconditionally and untruncated** — +# they are the primary observability signal for terminus-2. If full content +# is undesirable in a given deployment, configure exporter-side filtering or +# attribute-length limits in the SDK instead. + + +def _commands_to_arguments_json(commands) -> str: + """Serialize a list of ``Command`` objects into a JSON string for + ``gen_ai.tool.call.arguments``.""" + serialized = [] + for cmd in commands: + serialized.append({ + "keystrokes": getattr(cmd, "keystrokes", ""), + "duration_sec": getattr(cmd, "duration_sec", None), + }) + try: + return json.dumps(serialized, ensure_ascii=False) + except Exception: + return str(serialized) + +# ── ReAct step lifecycle tracked via contextvars ──────────────────────────── +# A STEP span stays open across `_handle_llm_interaction` ⇒ `_execute_commands` +# so both become its children. It is closed when the next iteration starts or +# when `_run_agent_loop` returns. +_current_step_span = contextvars.ContextVar( + "terminus2_current_step_span", default=None +) +_current_step_token = contextvars.ContextVar( + "terminus2_current_step_token", default=None +) +_react_round_counter = contextvars.ContextVar( + "terminus2_react_round_counter", default=0 +) + + +def _end_current_step(finish_reason: str | None = None) -> None: + """End the active ReAct STEP span (if any) and detach its context.""" + span = _current_step_span.get() + token = _current_step_token.get() + if span is not None: + if finish_reason: + span.set_attribute(_GEN_AI_REACT_FINISH_REASON, finish_reason) + span.end() + _current_step_span.set(None) + if token is not None: + context_api.detach(token) + _current_step_token.set(None) + + +def _infer_provider_name(model_name: str) -> str: + """Infer ``gen_ai.provider.name`` from a model identifier string.""" + if not model_name: + return "unknown" + lower = model_name.lower() + if any(k in lower for k in ("gpt", "o1-", "o3-", "o4-")): + return "openai" + if "claude" in lower or "anthropic" in lower: + return "anthropic" + if "gemini" in lower: + return "google" + if "llama" in lower or "meta" in lower: + return "meta" + if "mistral" in lower: + return "mistral" + if "qwen" in lower: + return "alibaba" + if "deepseek" in lower: + return "deepseek" + if "/" in model_name: + return model_name.split("/", 1)[0] + return "unknown" + + +# Sentinel attribute attached to every target we successfully wrap. Stored +# on the target callable itself (not in module-level state) so that +# duplicate wraps are detected even if this package is loaded as multiple +# module instances (e.g. wheel install + ``pip install -e`` source, or +# under different sys.path roots), or if ``_instrument()`` is invoked +# twice via auto-loader + manual call. +_TERMINUS2_MARKER = "_otel_terminus2_wrapped" + + +def _resolve_target(module: str, name: str): + """Resolve ``module.name`` (where ``name`` may be ``Class.method``). + + Returns ``(parent, attr_name, current_value)``. Raises on missing + module / attribute. + """ + from importlib import import_module + mod = import_module(module) + parts = name.split(".") + parent = mod + for p in parts[:-1]: + parent = getattr(parent, p) + attr = parts[-1] + return parent, attr, getattr(parent, attr, None) + + +def _try_wrap(module: str, name: str, wrapper) -> None: + """Wrap ``module.name`` with ``wrapper`` exactly once. + + Idempotency is enforced via a sentinel attribute attached to the + target — robust against multiple module instances of this package and + repeated ``_instrument()`` invocations. + """ + try: + parent, attr, current = _resolve_target(module, name) + except Exception as e: + logger.warning(f"Could not resolve {module}.{name}: {e}") + return + + if current is None: + logger.warning(f"{module}.{name} not found") + return + + if getattr(current, _TERMINUS2_MARKER, False): + logger.debug( + f"{module}.{name} already wrapped by terminus2 instrumentation, " + "skipping" + ) + return + + try: + wrap_function_wrapper(module=module, name=name, wrapper=wrapper) + except Exception as e: + logger.warning(f"Could not wrap {module}.{name}: {e}") + return + + # Mark the freshly installed wrapper. wrapt's FunctionWrapper proxies + # attribute writes to the underlying wrapped object, but reading the + # attribute back through the proxy returns the same value, so a + # subsequent ``getattr`` check on either layer detects the marker. + new_value = getattr(parent, attr, None) + if new_value is not None: + try: + setattr(new_value, _TERMINUS2_MARKER, True) + except Exception as e: + logger.debug(f"Could not mark {module}.{name}: {e}") + + +def _try_unwrap(module: str, name: str) -> None: + """Reverse of :func:`_try_wrap`.""" + try: + parent, attr, current = _resolve_target(module, name) + except Exception: + return + + if current is None or not getattr(current, _TERMINUS2_MARKER, False): + return + + # Clear the marker on the underlying object first (FunctionWrapper + # forwards delattr to the wrapped object, so the marker — which was + # written through to the original — is removed cleanly). + try: + delattr(current, _TERMINUS2_MARKER) + except (AttributeError, TypeError): + pass + + try: + unwrap(parent, attr) + except Exception as e: + logger.debug(f"Could not unwrap {module}.{name}: {e}") + + +# ═══════════════════════════════════════════════════════════════════════════ +# Instrumentor +# ═══════════════════════════════════════════════════════════════════════════ + +class Terminus2Instrumentor(BaseInstrumentor): + """Instrumentor for the terminus-2 agent from terminal-bench.""" + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + tracer = trace_api.get_tracer(__name__, "", tracer_provider=tracer_provider) + + # P0 – ENTRY span (application entry point) + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2.perform_task", + _PerformTaskWrapper(tracer), + ) + + # P0 – AGENT span (agent invocation) + ReAct loop lifecycle + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._run_agent_loop", + _RunAgentLoopWrapper(tracer), + ) + + # NOTE: LLM spans for ``LiteLLM.call`` are NOT produced here — + # ``opentelemetry-instrumentation-litellm`` already traces the + # underlying ``litellm.completion`` invocation. Wrapping again would + # produce duplicate LLM spans for every model call. + + # P0 – TOOL span for terminal command batch + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._execute_commands", + _ExecuteCommandsWrapper(tracer), + ) + + # P1 – STEP span per ReAct iteration + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._handle_llm_interaction", + _HandleLLMInteractionWrapper(tracer), + ) + + # P1 – TASK span for parser (json + xml) + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_json_plain_parser", + "TerminusJSONPlainParser.parse_response", + _ParseResponseWrapper(tracer, "json"), + ) + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_xml_plain_parser", + "TerminusXMLPlainParser.parse_response", + _ParseResponseWrapper(tracer, "xml"), + ) + + # P2 – CHAIN span for context-overflow handoff + _try_wrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._summarize", + _SummarizeWrapper(tracer), + ) + + def _uninstrument(self, **kwargs: Any) -> None: + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2.perform_task", + ) + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._run_agent_loop", + ) + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._execute_commands", + ) + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._handle_llm_interaction", + ) + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_json_plain_parser", + "TerminusJSONPlainParser.parse_response", + ) + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_xml_plain_parser", + "TerminusXMLPlainParser.parse_response", + ) + _try_unwrap( + "terminal_bench.agents.terminus_2.terminus_2", + "Terminus2._summarize", + ) + _end_current_step() + + +# ═══════════════════════════════════════════════════════════════════════════ +# P0 — ENTRY span: Terminus2.perform_task +# ═══════════════════════════════════════════════════════════════════════════ + +class _PerformTaskWrapper: + """Wrap ``Terminus2.perform_task`` to produce the **ENTRY** span. + + Per spec: span name ``enter_ai_application_system``, + ``gen_ai.span.kind=ENTRY``, ``gen_ai.operation.name=enter``. + + Records the user instruction as ``input.value`` and a serialized summary + of ``AgentResult`` (failure_mode, token totals, marker count) as + ``output.value`` once the task completes. + """ + + def __init__(self, tracer): + self._tracer = tracer + + @hook_advice( + instrumentation_name="terminus2", + advice_method="perform_task", + throw_exception=True, + ) + def __call__(self, wrapped, instance, args, kwargs): + model_name = getattr(instance, "_model_name", "unknown") + instruction = args[0] if args else kwargs.get("instruction", "") + + with self._tracer.start_as_current_span( + "enter_ai_application_system", + kind=SpanKind.SERVER, + ) as span: + span.set_attribute(CommonAttributes.GEN_AI_SPAN_KIND, _SPAN_KIND_ENTRY) + span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_ENTER) + span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK) + span.set_attribute(LLMAttributes.GEN_AI_REQUEST_MODEL, model_name) + span.set_attribute( + LLMAttributes.GEN_AI_PROVIDER_NAME, + _infer_provider_name(model_name), + ) + + if instruction: + span.set_attribute( + CommonAttributes.INPUT_VALUE, str(instruction) + ) + span.set_attribute(CommonAttributes.INPUT_MIME_TYPE, "text/plain") + + try: + result = wrapped(*args, **kwargs) + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR)) + raise + + input_tokens = getattr(result, "total_input_tokens", 0) or 0 + output_tokens = getattr(result, "total_output_tokens", 0) or 0 + failure_mode = getattr(result, "failure_mode", None) + failure_mode_str = str( + getattr(failure_mode, "value", failure_mode) + ) if failure_mode is not None else "none" + markers = getattr(result, "timestamped_markers", None) or [] + + output_summary = { + "failure_mode": failure_mode_str, + "total_input_tokens": input_tokens, + "total_output_tokens": output_tokens, + "marker_count": len(markers), + } + try: + output_value = json.dumps(output_summary, ensure_ascii=False) + except Exception: + output_value = str(output_summary) + + span.set_attribute(CommonAttributes.OUTPUT_VALUE, output_value) + span.set_attribute(CommonAttributes.OUTPUT_MIME_TYPE, "application/json") + span.set_attribute("terminus2.failure_mode", failure_mode_str) + + span.set_status(Status(StatusCode.OK)) + return result + + +# ═══════════════════════════════════════════════════════════════════════════ +# P0 — AGENT span: Terminus2._run_agent_loop +# ═══════════════════════════════════════════════════════════════════════════ + +class _RunAgentLoopWrapper: + """Wrap ``Terminus2._run_agent_loop`` to produce the **AGENT** span. + + Per spec: span name ``invoke_agent {agent.name}``, + ``gen_ai.span.kind=AGENT``, ``gen_ai.operation.name=invoke_agent``. + + The AGENT span precisely brackets the ReAct loop body — STEP / TOOL / + TASK / CHAIN children all hang off it. Token totals are aggregated + from the ``Chat`` cumulative counters once the loop returns. Also + cleans up any trailing STEP span on loop exit. + """ + + def __init__(self, tracer): + self._tracer = tracer + + @hook_advice( + instrumentation_name="terminus2", + advice_method="run_agent_loop", + throw_exception=True, + ) + def __call__(self, wrapped, instance, args, kwargs): + # Reset per-loop ReAct state + _react_round_counter.set(0) + _end_current_step() + + model_name = getattr(instance, "_model_name", "unknown") + parser_name = getattr(instance, "_parser_name", "unknown") + + # _run_agent_loop signature: + # (initial_prompt, session, chat, logging_dir=None, + # original_instruction="") + chat = args[2] if len(args) > 2 else kwargs.get("chat") + original_instruction = ( + args[4] if len(args) > 4 else kwargs.get("original_instruction", "") + ) + + with self._tracer.start_as_current_span( + f"invoke_agent {_AGENT_NAME}", + kind=SpanKind.INTERNAL, + ) as span: + span.set_attribute( + CommonAttributes.GEN_AI_SPAN_KIND, GenAiSpanKind.AGENT.value + ) + span.set_attribute( + CommonAttributes.GEN_AI_OPERATION_NAME, + GenAiOperationName.INVOKE_AGENT.value, + ) + span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK) + span.set_attribute("gen_ai.agent.name", _AGENT_NAME) + span.set_attribute( + "gen_ai.agent.description", + "Terminus-2 terminal-bench agent (ReAct loop over a tmux session)", + ) + span.set_attribute(LLMAttributes.GEN_AI_REQUEST_MODEL, model_name) + span.set_attribute( + LLMAttributes.GEN_AI_PROVIDER_NAME, + _infer_provider_name(model_name), + ) + span.set_attribute("terminus2.parser", parser_name) + + if original_instruction: + span.set_attribute( + CommonAttributes.INPUT_VALUE, + str(original_instruction), + ) + span.set_attribute(CommonAttributes.INPUT_MIME_TYPE, "text/plain") + + try: + result = wrapped(*args, **kwargs) + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR)) + _end_current_step(finish_reason="loop_end") + raise + + _end_current_step(finish_reason="loop_end") + + # Aggregate token usage from the Chat object — captured here so + # the totals reflect the full loop, including the bare + # ``chat._model.call`` invoked inside ``_summarize``. + # ``Chat.total_*_tokens`` returns cumulative counters that + # survive context unwinding. + if chat is not None: + input_tokens = getattr(chat, "total_input_tokens", 0) or 0 + output_tokens = getattr(chat, "total_output_tokens", 0) or 0 + span.set_attribute( + LLMAttributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens + ) + span.set_attribute( + LLMAttributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens + ) + span.set_attribute( + LLMAttributes.GEN_AI_USAGE_TOTAL_TOKENS, + input_tokens + output_tokens, + ) + + span.set_attribute( + "terminus2.react.rounds", _react_round_counter.get() + ) + + span.set_status(Status(StatusCode.OK)) + return result + + +# ═══════════════════════════════════════════════════════════════════════════ +# P0 — TOOL span: Terminus2._execute_commands +# ═══════════════════════════════════════════════════════════════════════════ + +class _ExecuteCommandsWrapper: + """Wrap ``Terminus2._execute_commands`` to produce a **TOOL** span. + + Per spec: span name ``execute_tool {tool_name}``, + ``gen_ai.span.kind=TOOL``, ``gen_ai.operation.name=execute_tool``. + """ + + def __init__(self, tracer): + self._tracer = tracer + + @hook_advice( + instrumentation_name="terminus2", + advice_method="execute_commands", + throw_exception=True, + ) + def __call__(self, wrapped, instance, args, kwargs): + commands = args[0] if args else kwargs.get("commands", []) + + with self._tracer.start_as_current_span( + f"execute_tool {_TERMINAL_TOOL_NAME}", + kind=SpanKind.INTERNAL, + ) as span: + span.set_attribute( + CommonAttributes.GEN_AI_SPAN_KIND, GenAiSpanKind.TOOL.value + ) + span.set_attribute( + CommonAttributes.GEN_AI_OPERATION_NAME, + GenAiOperationName.EXECUTE_TOOL.value, + ) + span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK) + span.set_attribute(ToolAttributes.GEN_AI_TOOL_NAME, _TERMINAL_TOOL_NAME) + span.set_attribute( + ToolAttributes.GEN_AI_TOOL_DESCRIPTION, _TERMINAL_TOOL_DESCRIPTION + ) + span.set_attribute( + ToolAttributes.GEN_AI_TOOL_TYPE, GenAiToolType.EXTENSION.value + ) + span.set_attribute("terminus2.commands.count", len(commands)) + + arguments_json = _commands_to_arguments_json(commands) + # Spec attribute (gen-ai.md §Tool) + span.set_attribute(_GEN_AI_TOOL_CALL_ARGUMENTS, arguments_json) + # Common input.value mirror — many viewers only render this + span.set_attribute(CommonAttributes.INPUT_VALUE, arguments_json) + span.set_attribute(CommonAttributes.INPUT_MIME_TYPE, "application/json") + + try: + result = wrapped(*args, **kwargs) + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR)) + raise + + timeout_occurred, terminal_output = result + span.set_attribute("terminus2.terminal.timeout", timeout_occurred) + + if terminal_output is not None: + output_text = str(terminal_output) + # Spec attribute (gen-ai.md §Tool) + span.set_attribute(_GEN_AI_TOOL_CALL_RESULT, output_text) + # Common output.value mirror + span.set_attribute(CommonAttributes.OUTPUT_VALUE, output_text) + span.set_attribute(CommonAttributes.OUTPUT_MIME_TYPE, "text/plain") + + span.set_status(Status(StatusCode.OK)) + return result + + +# ═══════════════════════════════════════════════════════════════════════════ +# P1 — STEP span: Terminus2._handle_llm_interaction +# ═══════════════════════════════════════════════════════════════════════════ + +class _HandleLLMInteractionWrapper: + """Wrap ``Terminus2._handle_llm_interaction`` to produce a **STEP** span. + + The STEP span represents one ReAct iteration. It opens here, stays open + after this method returns (so the subsequent ``_execute_commands`` call + in ``_run_agent_loop`` becomes its child), and is closed on the next + iteration entry or by ``_RunAgentLoopWrapper`` cleanup. + """ + + def __init__(self, tracer): + self._tracer = tracer + + @hook_advice( + instrumentation_name="terminus2", + advice_method="handle_llm_interaction", + throw_exception=True, + ) + def __call__(self, wrapped, instance, args, kwargs): + # Close previous STEP first (if any) + _end_current_step(finish_reason="next_round") + + round_num = _react_round_counter.get() + 1 + _react_round_counter.set(round_num) + + step_span = self._tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + ) + step_span.set_attribute(CommonAttributes.GEN_AI_SPAN_KIND, _SPAN_KIND_STEP) + step_span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_REACT) + step_span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK) + step_span.set_attribute(_GEN_AI_REACT_ROUND, round_num) + + ctx = trace_api.set_span_in_context(step_span) + token = context_api.attach(ctx) + _current_step_span.set(step_span) + _current_step_token.set(token) + + try: + result = wrapped(*args, **kwargs) + except Exception as e: + step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "error") + step_span.record_exception(e) + step_span.set_status(Status(StatusCode.ERROR)) + raise + + commands, is_task_complete, feedback = result + + if is_task_complete: + step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "complete") + elif feedback and "ERROR:" in feedback: + step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "parse_error") + + # Span stays open: closed by next iteration or _RunAgentLoopWrapper + return result + + +# ═══════════════════════════════════════════════════════════════════════════ +# P1 — TASK span: parser.parse_response +# ═══════════════════════════════════════════════════════════════════════════ + +class _ParseResponseWrapper: + """Wrap ``parser.parse_response`` to produce a **TASK** span. + + Per spec: span name ``run_task {task_name}``, + ``gen_ai.span.kind=TASK``, ``gen_ai.operation.name=run_task``. + """ + + def __init__(self, tracer, parser_type): + self._tracer = tracer + self._parser_type = parser_type + + @hook_advice( + instrumentation_name="terminus2", + advice_method="parse_response", + throw_exception=True, + ) + def __call__(self, wrapped, instance, args, kwargs): + # parse_response signature: (self, response: str) + response_text = args[0] if args else kwargs.get("response", "") + + with self._tracer.start_as_current_span( + "run_task parse_response", + kind=SpanKind.INTERNAL, + ) as span: + span.set_attribute( + CommonAttributes.GEN_AI_SPAN_KIND, GenAiSpanKind.TASK.value + ) + span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_RUN_TASK) + span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK) + span.set_attribute("terminus2.parser", self._parser_type) + + if response_text is not None: + span.set_attribute( + CommonAttributes.INPUT_VALUE, str(response_text) + ) + span.set_attribute(CommonAttributes.INPUT_MIME_TYPE, "text/plain") + + try: + result = wrapped(*args, **kwargs) + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR)) + raise + + span.set_attribute("terminus2.task_complete", result.is_task_complete) + span.set_attribute("terminus2.commands.count", len(result.commands)) + + output_summary = { + "is_task_complete": result.is_task_complete, + "commands": [ + { + "keystrokes": getattr(c, "keystrokes", ""), + "duration": getattr(c, "duration", None), + } + for c in result.commands + ], + "error": result.error or "", + "warning": result.warning or "", + } + try: + output_value = json.dumps(output_summary, ensure_ascii=False) + except Exception: + output_value = str(output_summary) + span.set_attribute(CommonAttributes.OUTPUT_VALUE, output_value) + span.set_attribute(CommonAttributes.OUTPUT_MIME_TYPE, "application/json") + + if result.error: + span.set_attribute("terminus2.parse.error", str(result.error)) + + if result.warning: + span.set_attribute("terminus2.parse.warning", str(result.warning)) + + span.set_status(Status(StatusCode.OK)) + return result + + +# ═══════════════════════════════════════════════════════════════════════════ +# P2 — CHAIN span: Terminus2._summarize +# ═══════════════════════════════════════════════════════════════════════════ + +class _SummarizeWrapper: + """Wrap ``Terminus2._summarize`` to produce a **CHAIN** span. + + Per spec: span name ``chain {chain_name}``, + ``gen_ai.span.kind=CHAIN``. The summarize handoff itself triggers + multiple inner LLM calls so it semantically maps to a Chain. + """ + + def __init__(self, tracer): + self._tracer = tracer + + @hook_advice( + instrumentation_name="terminus2", + advice_method="summarize", + throw_exception=True, + ) + def __call__(self, wrapped, instance, args, kwargs): + with self._tracer.start_as_current_span( + "chain summarize", + kind=SpanKind.INTERNAL, + ) as span: + span.set_attribute( + CommonAttributes.GEN_AI_SPAN_KIND, GenAiSpanKind.CHAIN.value + ) + span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_TASK) + span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK) + + try: + result = wrapped(*args, **kwargs) + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR)) + raise + + span.set_status(Status(StatusCode.OK)) + return result diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py new file mode 100644 index 000000000..d92c81333 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_instruments = ("terminal-bench >= 0.1.0",) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py new file mode 100644 index 000000000..5fd301e2e --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt new file mode 100644 index 000000000..f98537dd8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt @@ -0,0 +1,4 @@ +terminal-bench>=0.1.0 +-e aliyun-semantic-conventions +-e util/opentelemetry-util-http +-e instrumentation-loongsuite/loongsuite-instrumentation-terminus2 diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md new file mode 100644 index 000000000..a722e267c --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md @@ -0,0 +1,17 @@ +# LoongSuite VitaBench Instrumentation + +OpenTelemetry instrumentation for the VitaBench multi-domain simulation framework. + +## Installation + +```bash +pip install loongsuite-instrumentation-vita +``` + +## Usage + +```python +from opentelemetry.instrumentation.vita import VitaInstrumentor + +VitaInstrumentor().instrument() +``` diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-vita/pyproject.toml new file mode 100644 index 000000000..d1df8fa2e --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/pyproject.toml @@ -0,0 +1,55 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-vita" +dynamic = ["version"] +description = "LoongSuite VitaBench instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "Zhiyong Liu", email = "liuzhiyong.lzy@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "vita >= 0.0.1", +] + +[project.entry-points.opentelemetry_instrumentor] +vita = "opentelemetry.instrumentation.vita:VitaInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-vita" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/vita/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py new file mode 100644 index 000000000..1e58668a6 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py @@ -0,0 +1,223 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +OpenTelemetry VitaBench Instrumentation + +Usage +----- +.. code:: python + + from opentelemetry.instrumentation.vita import VitaInstrumentor + + VitaInstrumentor().instrument() + + # ... run vitabench tasks ... + + VitaInstrumentor().uninstrument() + +API +--- +""" + +from __future__ import annotations + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.instrumentation.vita.package import _instruments +from opentelemetry.instrumentation.vita.patch import ( + wrap_generate, + wrap_generate_next_message, + wrap_get_response, + wrap_orchestrator_run, + wrap_orchestrator_step, + wrap_run_task, +) +from opentelemetry.instrumentation.vita.version import __version__ +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler + +logger = logging.getLogger(__name__) + +__all__ = ["VitaInstrumentor", "__version__"] + + +class VitaInstrumentor(BaseInstrumentor): + """OpenTelemetry instrumentor for VitaBench framework. + + Instruments the following components: + - vita.run.run_task(): Entry spans (ENTRY) + - Orchestrator.run(): Workflow spans (CHAIN) + - Orchestrator.step(): ReAct step spans (STEP) + - LLMAgent.generate_next_message(): Agent spans (AGENT) + - generate(): LLM call spans (LLM) + - Environment.get_response(): Tool execution spans (TOOL) + """ + + def __init__(self): + super().__init__() + self._handler = None + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + """Enable VitaBench instrumentation.""" + tracer_provider = kwargs.get("tracer_provider") + meter_provider = kwargs.get("meter_provider") + logger_provider = kwargs.get("logger_provider") + + self._handler = ExtendedTelemetryHandler( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + logger_provider=logger_provider, + ) + + # Hook #5: generate -> LLM. Wrap this first so modules that import + # generate directly (for example vita.agent.llm_agent) bind to the + # instrumented function during their import. + try: + wrap_function_wrapper( + module="vita.utils.llm_utils", + name="generate", + wrapper=lambda w, i, a, k: wrap_generate( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented vita.utils.llm_utils.generate") + except Exception as e: + logger.warning(f"Could not wrap vita.utils.llm_utils.generate: {e}") + + # Hook #1: run_task -> ENTRY + try: + wrap_function_wrapper( + module="vita.run", + name="run_task", + wrapper=lambda w, i, a, k: wrap_run_task( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented vita.run.run_task") + except Exception as e: + logger.warning(f"Could not wrap vita.run.run_task: {e}") + + # Hook #2: Orchestrator.run -> CHAIN + try: + wrap_function_wrapper( + module="vita.orchestrator.orchestrator", + name="Orchestrator.run", + wrapper=lambda w, i, a, k: wrap_orchestrator_run( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented Orchestrator.run") + except Exception as e: + logger.warning(f"Could not wrap Orchestrator.run: {e}") + + # Hook #3: Orchestrator.step -> STEP + try: + wrap_function_wrapper( + module="vita.orchestrator.orchestrator", + name="Orchestrator.step", + wrapper=lambda w, i, a, k: wrap_orchestrator_step( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented Orchestrator.step") + except Exception as e: + logger.warning(f"Could not wrap Orchestrator.step: {e}") + + # Hook #4a: LLMAgent.generate_next_message -> AGENT + try: + wrap_function_wrapper( + module="vita.agent.llm_agent", + name="LLMAgent.generate_next_message", + wrapper=lambda w, i, a, k: wrap_generate_next_message( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented LLMAgent.generate_next_message") + except Exception as e: + logger.warning(f"Could not wrap LLMAgent.generate_next_message: {e}") + + # Hook #4b: LLMSoloAgent.generate_next_message -> AGENT + try: + wrap_function_wrapper( + module="vita.agent.llm_agent", + name="LLMSoloAgent.generate_next_message", + wrapper=lambda w, i, a, k: wrap_generate_next_message( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented LLMSoloAgent.generate_next_message") + except Exception as e: + logger.warning(f"Could not wrap LLMSoloAgent.generate_next_message: {e}") + + # Hook #6: Environment.get_response -> TOOL + try: + wrap_function_wrapper( + module="vita.environment.environment", + name="Environment.get_response", + wrapper=lambda w, i, a, k: wrap_get_response( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented Environment.get_response") + except Exception as e: + logger.warning(f"Could not wrap Environment.get_response: {e}") + + def _uninstrument(self, **kwargs: Any) -> None: + """Disable VitaBench instrumentation.""" + try: + import vita.run # noqa: PLC0415 + + unwrap(vita.run, "run_task") + except Exception as e: + logger.debug(f"Failed to uninstrument vita.run.run_task: {e}") + + try: + import vita.orchestrator.orchestrator # noqa: PLC0415 + + unwrap(vita.orchestrator.orchestrator.Orchestrator, "run") + unwrap(vita.orchestrator.orchestrator.Orchestrator, "step") + except Exception as e: + logger.debug(f"Failed to uninstrument Orchestrator: {e}") + + try: + import vita.agent.llm_agent # noqa: PLC0415 + + unwrap(vita.agent.llm_agent.LLMAgent, "generate_next_message") + unwrap(vita.agent.llm_agent.LLMSoloAgent, "generate_next_message") + except Exception as e: + logger.debug(f"Failed to uninstrument LLMAgent: {e}") + + try: + import vita.utils.llm_utils # noqa: PLC0415 + + unwrap(vita.utils.llm_utils, "generate") + except Exception as e: + logger.debug(f"Failed to uninstrument generate: {e}") + + try: + import vita.environment.environment # noqa: PLC0415 + + unwrap(vita.environment.environment.Environment, "get_response") + except Exception as e: + logger.debug(f"Failed to uninstrument Environment: {e}") + + self._handler = None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py new file mode 100644 index 000000000..a776722c9 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py @@ -0,0 +1,3 @@ +_instruments = ("vita >= 0.0.1",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py new file mode 100644 index 000000000..182da38d6 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py @@ -0,0 +1,432 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Patch functions for VitaBench instrumentation. + +Wraps key vitabench methods to generate OpenTelemetry spans: +- run_task() -> ENTRY spans +- Orchestrator.run() -> CHAIN spans +- Orchestrator.step() -> STEP spans (react) +- LLMAgent.generate_next_message() -> AGENT spans +- generate() -> LLM spans +- Environment.get_response() -> TOOL spans +""" + +from __future__ import annotations + +import json +import logging +import uuid +from contextvars import ContextVar +from typing import Any, Optional + +from opentelemetry import trace as trace_api +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) +from opentelemetry.util.genai.types import ( + Error, + InputMessage, + LLMInvocation, + OutputMessage, + Text, +) + +from .utils import ( + _convert_vita_assistant_to_output, + _convert_vita_messages_to_input, + _get_tool_definitions, + _infer_provider, + _MAX_CONTENT_LEN, +) + +logger = logging.getLogger(__name__) + +# ContextVars for ReAct step tracking +_react_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar( + "vita_react_step_invocation", default=None +) +_react_step_counter: ContextVar[int] = ContextVar( + "vita_react_step_counter", default=0 +) + +# Reentrancy guard for AGENT span (LLMSoloAgent extends LLMAgent) +_in_agent_invoke: ContextVar[bool] = ContextVar( + "vita_in_agent_invoke", default=False +) + + +def _close_active_react_step(handler: ExtendedTelemetryHandler) -> None: + """Close the currently active react_step span, if any.""" + prev = _react_step_invocation.get() + if prev is not None: + try: + handler.stop_react_step(prev) + except Exception as e: + logger.debug(f"Failed to close react step: {e}") + _react_step_invocation.set(None) + + +# ==================== Hook #1: run_task -> ENTRY ==================== + + +def wrap_run_task( + wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler +): + """Wrapper for vita.run.run_task to create ENTRY span.""" + task = args[1] if len(args) > 1 else kwargs.get("task") + domain = args[0] if args else kwargs.get("domain") + + invocation = EntryInvocation( + session_id=str(uuid.uuid4()), + user_id=None, + ) + invocation.attributes["gen_ai.framework"] = "vitabench" + + if task and hasattr(task, "instructions") and task.instructions: + invocation.input_messages = [ + InputMessage(role="user", parts=[Text(content=str(task.instructions)[:_MAX_CONTENT_LEN])]) + ] + + handler.start_entry(invocation) + try: + result = wrapped(*args, **kwargs) + + if result: + output_parts = [] + if hasattr(result, "termination_reason") and result.termination_reason: + output_parts.append(Text(content=f"termination: {result.termination_reason}")) + if hasattr(result, "reward_info") and result.reward_info: + reward = getattr(result.reward_info, "reward", None) + if reward is not None: + output_parts.append(Text(content=f"reward: {reward}")) + if output_parts: + invocation.output_messages = [ + OutputMessage( + role="assistant", + parts=output_parts, + finish_reason="stop", + ) + ] + + handler.stop_entry(invocation) + return result + except Exception as e: + handler.fail_entry(invocation, Error(message=str(e), type=type(e))) + raise + + +# ==================== Hook #2: Orchestrator.run -> CHAIN ==================== + + +def wrap_orchestrator_run( + wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler +): + """Wrapper for Orchestrator.run to create CHAIN span.""" + task = getattr(instance, "task", None) + domain = getattr(instance, "domain", "unknown") + span_name = f"workflow {domain}" + + input_text = "" + if task and hasattr(task, "instructions") and task.instructions: + input_text = str(task.instructions)[:_MAX_CONTENT_LEN] + + tracer = handler._tracer + + # Reset step counter for this orchestrator run + counter_token = _react_step_counter.set(0) + step_token = _react_step_invocation.set(None) + + with tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes={ + "gen_ai.operation.name": "workflow", + "gen_ai.system": "vitabench", + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "CHAIN", + "gen_ai.framework": "vitabench", + }, + ) as span: + if input_text: + span.set_attribute("input.value", input_text) + + try: + result = wrapped(*args, **kwargs) + + # Close any remaining open step span + _close_active_react_step(handler) + + if result and hasattr(result, "termination_reason") and result.termination_reason: + span.set_attribute("output.value", str(result.termination_reason)) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + # Close any remaining open step span + _close_active_react_step(handler) + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR)) + raise + finally: + _react_step_counter.reset(counter_token) + _react_step_invocation.reset(step_token) + + +# ==================== Hook #3: Orchestrator.step -> STEP ==================== + + +def wrap_orchestrator_step( + wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler +): + """Wrapper for Orchestrator.step to create STEP span on AGENT turns.""" + to_role = getattr(instance, "to_role", None) + + # Import Role enum dynamically to avoid import-time dependency + _Role = None + try: + from vita.orchestrator.orchestrator import Role + _Role = Role + except ImportError: + pass + + is_agent_turn = False + if _Role is not None: + is_agent_turn = (to_role == _Role.AGENT) + else: + is_agent_turn = (str(to_role) == "Role.AGENT" or str(to_role) == "agent") + + if is_agent_turn: + # Close previous STEP span (deferred close strategy) + _close_active_react_step(handler) + + step_num = _react_step_counter.get() + 1 + _react_step_counter.set(step_num) + + step_inv = ReactStepInvocation(round=step_num) + handler.start_react_step(step_inv) + _react_step_invocation.set(step_inv) + + try: + result = wrapped(*args, **kwargs) + + if is_agent_turn: + current_step = _react_step_invocation.get() + if current_step: + done = getattr(instance, "done", False) + if done: + term_reason = getattr(instance, "termination_reason", None) + if term_reason: + current_step.finish_reason = ( + term_reason.value + if hasattr(term_reason, "value") + else str(term_reason) + ) + else: + current_step.finish_reason = "agent_stop" + else: + message = getattr(instance, "message", None) + if message and hasattr(message, "is_tool_call") and message.is_tool_call(): + current_step.finish_reason = "tool_call" + else: + current_step.finish_reason = "assistant_text" + + return result + except Exception as e: + current_step = _react_step_invocation.get() + if current_step: + current_step.finish_reason = "error" + handler.fail_react_step(current_step, Error(message=str(e), type=type(e))) + _react_step_invocation.set(None) + raise + + +# ==================== Hook #4: generate_next_message -> AGENT ==================== + + +def wrap_generate_next_message( + wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler +): + """Wrapper for LLMAgent.generate_next_message / LLMSoloAgent.generate_next_message.""" + # Reentrancy guard + if _in_agent_invoke.get(): + return wrapped(*args, **kwargs) + token = _in_agent_invoke.set(True) + + try: + agent_name = instance.__class__.__name__ + model = getattr(instance, "llm", None) + + invocation = InvokeAgentInvocation( + provider="vitabench", + agent_name=agent_name, + request_model=model, + ) + + # input_messages + message = args[0] if args else kwargs.get("message") + state = args[1] if len(args) > 1 else kwargs.get("state") + if message: + invocation.input_messages = _convert_vita_messages_to_input([message]) + + # system_instruction + if state and hasattr(state, "system_messages") and state.system_messages: + invocation.system_instruction = [ + Text(content=str(sm.content)[:_MAX_CONTENT_LEN]) + for sm in state.system_messages + if sm and getattr(sm, "content", None) + ] + + # tool_definitions + tools = getattr(instance, "tools", None) + tool_defs = _get_tool_definitions(tools) + if tool_defs: + invocation.tool_definitions = tool_defs + + handler.start_invoke_agent(invocation) + + try: + result = wrapped(*args, **kwargs) + assistant_msg, _ = result + + # output_messages + invocation.output_messages = _convert_vita_assistant_to_output(assistant_msg) + + # token usage + usage = getattr(assistant_msg, "usage", None) + if usage and isinstance(usage, dict): + invocation.input_tokens = usage.get("prompt_tokens") + invocation.output_tokens = usage.get("completion_tokens") + + handler.stop_invoke_agent(invocation) + return result + except Exception as e: + handler.fail_invoke_agent(invocation, Error(message=str(e), type=type(e))) + raise + finally: + _in_agent_invoke.reset(token) + + +# ==================== Hook #5: generate -> LLM ==================== + + +def wrap_generate( + wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler +): + """Wrapper for vita.utils.llm_utils.generate to create LLM span.""" + model = args[0] if args else kwargs.get("model", "unknown") + messages = args[1] if len(args) > 1 else kwargs.get("messages", []) + tools = args[2] if len(args) > 2 else kwargs.get("tools") + temperature = kwargs.get("temperature") + + invocation = LLMInvocation( + request_model=model or "unknown", + provider=_infer_provider(model or ""), + temperature=temperature, + ) + invocation.max_tokens = kwargs.get("max_tokens") + + # input_messages + invocation.input_messages = _convert_vita_messages_to_input(messages) + + # tool_definitions + tool_defs = _get_tool_definitions(tools) + if tool_defs: + invocation.tool_definitions = tool_defs + + handler.start_llm(invocation) + + try: + result = wrapped(*args, **kwargs) + + if result: + # output_messages + invocation.output_messages = _convert_vita_assistant_to_output(result) + + # response_model_name + invocation.response_model_name = model + + # finish_reasons + if getattr(result, "tool_calls", None): + invocation.finish_reasons = ["tool_calls"] + else: + invocation.finish_reasons = ["stop"] + + # token usage + usage = getattr(result, "usage", None) + if usage and isinstance(usage, dict): + invocation.input_tokens = usage.get("prompt_tokens") + invocation.output_tokens = usage.get("completion_tokens") + + handler.stop_llm(invocation) + return result + except Exception as e: + handler.fail_llm(invocation, Error(message=str(e), type=type(e))) + raise + + +# ==================== Hook #6: Environment.get_response -> TOOL ==================== + + +def wrap_get_response( + wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler +): + """Wrapper for Environment.get_response to create TOOL span.""" + message = args[0] if args else kwargs.get("message") + + tool_name = getattr(message, "name", "unknown") if message else "unknown" + tool_call_id = getattr(message, "id", None) if message else None + + invocation = ExecuteToolInvocation( + tool_name=tool_name, + tool_call_id=tool_call_id, + provider="vitabench", + ) + + # tool_call_arguments + if message and hasattr(message, "arguments") and message.arguments: + try: + invocation.tool_call_arguments = json.dumps( + message.arguments, ensure_ascii=False, default=str + )[:_MAX_CONTENT_LEN] + except Exception: + invocation.tool_call_arguments = str(message.arguments)[:_MAX_CONTENT_LEN] + + handler.start_execute_tool(invocation) + + try: + result = wrapped(*args, **kwargs) + + # tool_call_result + if result and getattr(result, "content", None): + invocation.tool_call_result = str(result.content)[:_MAX_CONTENT_LEN] + + # Check if tool reported an error + if result and getattr(result, "error", False): + handler.fail_execute_tool( + invocation, + Error(message=f"Tool error: {getattr(result, 'content', '')}", type=RuntimeError), + ) + else: + handler.stop_execute_tool(invocation) + + return result + except Exception as e: + handler.fail_execute_tool(invocation, Error(message=str(e), type=type(e))) + raise diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/utils.py new file mode 100644 index 000000000..0793a6cc0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/utils.py @@ -0,0 +1,169 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for VitaBench instrumentation. + +Handles conversion between vitabench Message types and +OpenTelemetry GenAI semantic convention types. +""" + +from __future__ import annotations + +import json +import logging +from typing import Any, List, Optional + +from opentelemetry.util.genai.types import ( + FunctionToolDefinition, + InputMessage, + OutputMessage, + Text, + ToolCall as OTelToolCall, + ToolCallResponse, +) + +logger = logging.getLogger(__name__) + +_MAX_CONTENT_LEN = 4096 + + +def _convert_vita_messages_to_input(messages: Any) -> List[InputMessage]: + """Convert vita Message list to OTel InputMessage list.""" + if not messages: + return [] + + if not isinstance(messages, list): + messages = [messages] + + result = [] + for msg in messages: + try: + role = getattr(msg, "role", None) + if role is None: + continue + + parts = [] + content = getattr(msg, "content", None) + tool_calls = getattr(msg, "tool_calls", None) + + if role == "tool": + msg_id = getattr(msg, "id", None) or "" + if content: + parts.append( + ToolCallResponse( + id=msg_id, + response=str(content)[:_MAX_CONTENT_LEN], + ) + ) + else: + if content: + parts.append(Text(content=str(content)[:_MAX_CONTENT_LEN])) + if tool_calls: + for tc in tool_calls: + tc_args = getattr(tc, "arguments", {}) + if isinstance(tc_args, dict): + tc_args = json.dumps(tc_args, ensure_ascii=False, default=str) + parts.append( + OTelToolCall( + name=getattr(tc, "name", ""), + id=getattr(tc, "id", None), + arguments=tc_args, + ) + ) + + if parts: + result.append(InputMessage(role=role, parts=parts)) + except Exception as e: + logger.debug(f"Error converting vita message: {e}") + continue + + return result + + +def _convert_vita_assistant_to_output(msg: Any) -> List[OutputMessage]: + """Convert vita AssistantMessage to OTel OutputMessage list.""" + if not msg: + return [] + + parts = [] + content = getattr(msg, "content", None) + tool_calls = getattr(msg, "tool_calls", None) + + if content: + parts.append(Text(content=str(content)[:_MAX_CONTENT_LEN])) + if tool_calls: + for tc in tool_calls: + tc_args = getattr(tc, "arguments", {}) + if isinstance(tc_args, dict): + tc_args = json.dumps(tc_args, ensure_ascii=False, default=str) + parts.append( + OTelToolCall( + name=getattr(tc, "name", ""), + id=getattr(tc, "id", None), + arguments=tc_args, + ) + ) + + finish_reason = "tool_calls" if tool_calls else "stop" + + if not parts: + parts.append(Text(content="")) + + return [OutputMessage(role="assistant", parts=parts, finish_reason=finish_reason)] + + +def _infer_provider(model_name: str) -> str: + """Infer provider from model name string.""" + if not model_name: + return "unknown" + m = model_name.lower() + if "gpt" in m or "o1" in m or "o3" in m: + return "openai" + if "claude" in m: + return "anthropic" + if "qwen" in m: + return "alibaba_cloud" + if "deepseek" in m: + return "deepseek" + if "gemini" in m: + return "google" + return "unknown" + + +def _get_tool_definitions(tools: Any) -> Optional[List[FunctionToolDefinition]]: + """Extract tool definitions from vita Tool list.""" + if not tools: + return None + + try: + defs = [] + for t in tools: + name = getattr(t, "name", None) + if not name: + continue + parameters = None + openai_schema = getattr(t, "openai_schema", None) + if isinstance(openai_schema, dict): + function_schema = openai_schema.get("function", openai_schema) + parameters = function_schema.get("parameters") + defs.append( + FunctionToolDefinition( + name=name, + description=getattr(t, "short_desc", None), + parameters=parameters, + ) + ) + return defs if defs else None + except Exception: + return None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/version.py new file mode 100644 index 000000000..26056b5d8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/version.py @@ -0,0 +1 @@ +__version__ = "0.5.0.dev" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py new file mode 100644 index 000000000..0d2ab7221 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py @@ -0,0 +1,99 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test configuration for VitaBench instrumentation tests.""" + +import os + +import pytest + +from opentelemetry.instrumentation.vita import VitaInstrumentor +from opentelemetry.sdk._logs import LoggerProvider +from opentelemetry.sdk._logs.export import ( + InMemoryLogExporter, + SimpleLogRecordProcessor, +) +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import InMemoryMetricReader +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + + +def pytest_configure(config: pytest.Config): + os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental" + + +# ==================== Exporters ==================== + + +@pytest.fixture(scope="function", name="span_exporter") +def fixture_span_exporter(): + exporter = InMemorySpanExporter() + yield exporter + + +@pytest.fixture(scope="function", name="log_exporter") +def fixture_log_exporter(): + exporter = InMemoryLogExporter() + yield exporter + + +@pytest.fixture(scope="function", name="metric_reader") +def fixture_metric_reader(): + reader = InMemoryMetricReader() + yield reader + + +# ==================== Providers ==================== + + +@pytest.fixture(scope="function", name="tracer_provider") +def fixture_tracer_provider(span_exporter): + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function", name="logger_provider") +def fixture_logger_provider(log_exporter): + provider = LoggerProvider() + provider.add_log_record_processor(SimpleLogRecordProcessor(log_exporter)) + return provider + + +@pytest.fixture(scope="function", name="meter_provider") +def fixture_meter_provider(metric_reader): + meter_provider = MeterProvider( + metric_readers=[metric_reader], + ) + return meter_provider + + +# ==================== Instrumentation ==================== + + +@pytest.fixture(scope="function") +def instrument(tracer_provider, logger_provider, meter_provider): + instrumentor = VitaInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + logger_provider=logger_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/test_instrumentor.py new file mode 100644 index 000000000..a6a2339f8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/test_instrumentor.py @@ -0,0 +1,478 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for VitaBench instrumentation. + +The suite exercises all execute.md hook points. External I/O is replaced at the +HTTP/tool boundary, while the Vita agent/orchestrator call chain runs through +the real framework methods. +""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +from opentelemetry.instrumentation.vita import VitaInstrumentor + + +FAKE_MODELS_CONFIG = { + "qwen-max": { + "base_url": "http://fake-api.example.com/v1/chat/completions", + "headers": {"Authorization": "Bearer test-key"}, + }, + "gpt-4": { + "base_url": "http://fake-api.example.com/v1/chat/completions", + "headers": {"Authorization": "Bearer test-key"}, + }, + "claude-3-opus": { + "base_url": "http://fake-api.example.com/v1/chat/completions", + "headers": {"Authorization": "Bearer test-key"}, + }, +} + + +def _make_openai_response(content=None, tool_calls=None, usage=None): + message = {"role": "assistant", "content": content} + if tool_calls: + message["tool_calls"] = tool_calls + return { + "id": "chatcmpl-test", + "model": "test-model", + "choices": [{"message": message, "finish_reason": "stop"}], + "usage": usage + or {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}, + } + + +def _mock_requests_post(response_dict): + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = response_dict + return mock_resp + + +def _tool_call_response(): + return _make_openai_response( + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": { + "name": "get_order", + "arguments": '{"order_id": "123"}', + }, + } + ], + usage={"prompt_tokens": 100, "completion_tokens": 20, "total_tokens": 120}, + ) + + +def _text_response(content="Order 123 has been delivered. ###STOP###"): + return _make_openai_response( + content=content, + usage={"prompt_tokens": 200, "completion_tokens": 30, "total_tokens": 230}, + ) + + +class FakeTool: + name = "get_order" + short_desc = "Get order details" + openai_schema = { + "type": "function", + "function": { + "name": "get_order", + "description": "Get order details", + "parameters": { + "type": "object", + "properties": {"order_id": {"type": "string"}}, + }, + }, + } + + +class FakeTools: + def __init__(self): + self.db = SimpleNamespace(time="2026-01-01 00:00:00") + self._tools = {"get_order": FakeTool()} + + def get_tools(self): + return self._tools + + def use_tool(self, tool_name, **kwargs): + return {"tool": tool_name, "arguments": kwargs, "status": "delivered"} + + def get_db_hash(self): + return "fake-db-hash" + + +class DeterministicUser: + def get_init_state(self, message_history=None): + return SimpleNamespace(messages=message_history or []) + + def generate_next_message(self, message, state): + from vita.data_model.message import UserMessage + + user_message = UserMessage(role="user", content="Check order 123") + state.messages.append(user_message) + return user_message, state + + +def _make_agent(): + from vita.agent.llm_agent import LLMAgent + + return LLMAgent( + tools=[FakeTool()], + domain_policy="You are helpful at {time}.", + llm="qwen-max", + llm_args={}, + time="2026-01-01 00:00:00", + language="english", + ) + + +def _make_orchestrator(): + from vita.environment.environment import Environment + from vita.orchestrator.orchestrator import Orchestrator + + return Orchestrator( + domain="delivery", + agent=_make_agent(), + user=DeterministicUser(), + environment=Environment(domain_name="delivery", tools=FakeTools()), + task=SimpleNamespace( + id="task_001", + instructions="Check order 123", + message_history=None, + ), + max_steps=6, + max_errors=3, + language="english", + ) + + +def _span_attrs(spans, name): + span = next(s for s in spans if s.name == name) + return dict(span.attributes) + + +class TestVitaInstrumentor: + def test_instrument_and_uninstrument( + self, tracer_provider, logger_provider, meter_provider + ): + instrumentor = VitaInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + logger_provider=logger_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_instrumentation_dependencies(self): + assert VitaInstrumentor().instrumentation_dependencies() == ( + "vita >= 0.0.1", + ) + + +class TestLLMSpan: + def test_llm_span_text_response(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", + return_value=_mock_requests_post( + _make_openai_response( + content="The order has been delivered.", + usage={ + "prompt_tokens": 150, + "completion_tokens": 30, + "total_tokens": 180, + }, + ) + ), + ): + result = generate( + model="qwen-max", + messages=[UserMessage(role="user", content="Where is my order?")], + ) + + assert result.content == "The order has been delivered." + spans = span_exporter.get_finished_spans() + attrs = _span_attrs(spans, "chat qwen-max") + assert attrs["gen_ai.operation.name"] == "chat" + assert attrs["gen_ai.span.kind"] == "LLM" + assert attrs["gen_ai.request.model"] == "qwen-max" + assert attrs["gen_ai.provider.name"] == "alibaba_cloud" + assert attrs["gen_ai.usage.input_tokens"] == 150 + assert attrs["gen_ai.usage.output_tokens"] == 30 + assert attrs["gen_ai.response.finish_reasons"] == ("stop",) + + def test_llm_span_tool_call_response(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_tool_call_response()) + ): + result = generate( + model="gpt-4", + messages=[UserMessage(role="user", content="Check my order")], + ) + + assert result.tool_calls is not None + attrs = _span_attrs(span_exporter.get_finished_spans(), "chat gpt-4") + assert attrs["gen_ai.response.finish_reasons"] == ("tool_calls",) + assert attrs["gen_ai.provider.name"] == "openai" + + def test_llm_span_captures_positional_tools(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_text_response("Done.")) + ): + generate( + "qwen-max", + [UserMessage(role="user", content="Check my order")], + [FakeTool()], + ) + + attrs = _span_attrs(span_exporter.get_finished_spans(), "chat qwen-max") + assert "gen_ai.tool.definitions" in attrs + assert "get_order" in attrs["gen_ai.tool.definitions"] + + +class TestToolSpan: + def test_tool_span_created(self, instrument, span_exporter): + from vita.data_model.message import ToolCall + from vita.environment.environment import Environment + + env = Environment(domain_name="delivery", tools=FakeTools()) + result = env.get_response( + ToolCall(id="tc_42", name="get_order", arguments={"order_id": "999"}) + ) + + assert result.content is not None + attrs = _span_attrs( + span_exporter.get_finished_spans(), "execute_tool get_order" + ) + assert attrs["gen_ai.operation.name"] == "execute_tool" + assert attrs["gen_ai.span.kind"] == "TOOL" + assert attrs["gen_ai.tool.name"] == "get_order" + assert attrs["gen_ai.tool.call.id"] == "tc_42" + + def test_tool_span_on_error(self, instrument, span_exporter): + from vita.data_model.message import ToolCall + from vita.environment.environment import Environment + + tools = FakeTools() + tools.use_tool = MagicMock(side_effect=RuntimeError("Tool failed")) + env = Environment(domain_name="delivery", tools=tools) + result = env.get_response( + ToolCall(id="tc_err", name="get_order", arguments={}) + ) + + assert result.error is True + tool_span = next( + s + for s in span_exporter.get_finished_spans() + if s.name == "execute_tool get_order" + ) + assert tool_span.status.status_code.name == "ERROR" + + +class TestAgentSpan: + def test_agent_span_created_for_llm_agent(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + + agent = _make_agent() + state = agent.get_init_state([]) + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_text_response("Sure.")) + ): + assistant_msg, _ = agent.generate_next_message( + UserMessage(role="user", content="I need help"), state + ) + + assert assistant_msg.content == "Sure." + spans = span_exporter.get_finished_spans() + agent_span = next(s for s in spans if s.name == "invoke_agent LLMAgent") + llm_span = next(s for s in spans if s.name == "chat qwen-max") + attrs = dict(agent_span.attributes) + assert attrs["gen_ai.operation.name"] == "invoke_agent" + assert attrs["gen_ai.span.kind"] == "AGENT" + assert attrs["gen_ai.agent.name"] == "LLMAgent" + assert attrs["gen_ai.request.model"] == "qwen-max" + assert llm_span.parent.span_id == agent_span.context.span_id + + def test_agent_span_created_for_llm_solo_agent(self, instrument, span_exporter): + from vita.agent.llm_agent import LLMSoloAgent + + agent = LLMSoloAgent( + tools=[FakeTool()], + domain_policy="unused", + llm="qwen-max", + llm_args={}, + time="2026-01-01 00:00:00", + language="english", + ) + state = agent.get_init_state([]) + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_tool_call_response()) + ): + agent.generate_next_message(None, state) + + attrs = _span_attrs( + span_exporter.get_finished_spans(), "invoke_agent LLMSoloAgent" + ) + assert attrs["gen_ai.span.kind"] == "AGENT" + assert attrs["gen_ai.agent.name"] == "LLMSoloAgent" + + +class TestStepAndChainSpans: + def test_orchestrator_run_creates_chain_steps_agents_llms_and_tools( + self, instrument, span_exporter + ): + responses = [ + _mock_requests_post(_tool_call_response()), + _mock_requests_post(_text_response()), + ] + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", side_effect=responses + ): + result = _make_orchestrator().run() + + assert result.termination_reason == "agent_stop" + spans = span_exporter.get_finished_spans() + chain = next(s for s in spans if s.name == "workflow delivery") + steps = sorted( + [s for s in spans if s.name == "react step"], key=lambda s: s.start_time + ) + agents = sorted( + [s for s in spans if s.name == "invoke_agent LLMAgent"], + key=lambda s: s.start_time, + ) + llms = sorted( + [s for s in spans if s.name == "chat qwen-max"], + key=lambda s: s.start_time, + ) + tools = [s for s in spans if s.name == "execute_tool get_order"] + + assert len(steps) == 2 + assert len(agents) == 2 + assert len(llms) == 2 + assert len(tools) == 1 + + chain_attrs = dict(chain.attributes) + assert chain_attrs["gen_ai.operation.name"] == "workflow" + assert chain_attrs["gen_ai.span.kind"] == "CHAIN" + assert chain_attrs["gen_ai.framework"] == "vitabench" + + assert dict(steps[0].attributes)["gen_ai.react.round"] == 1 + assert dict(steps[1].attributes)["gen_ai.react.round"] == 2 + for step in steps: + assert step.parent.span_id == chain.context.span_id + assert agents[0].parent.span_id == steps[0].context.span_id + assert agents[1].parent.span_id == steps[1].context.span_id + assert llms[0].parent.span_id == agents[0].context.span_id + assert llms[1].parent.span_id == agents[1].context.span_id + assert tools[0].parent.span_id == steps[0].context.span_id + + def test_open_step_fails_when_env_turn_raises(self, instrument, span_exporter): + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_tool_call_response()) + ), patch( + "vita.environment.environment.Environment.get_response", + side_effect=RuntimeError("env broke"), + ): + with pytest.raises(RuntimeError, match="env broke"): + _make_orchestrator().run() + + spans = span_exporter.get_finished_spans() + step = next(s for s in spans if s.name == "react step") + chain = next(s for s in spans if s.name == "workflow delivery") + step_attrs = dict(step.attributes) + assert step.status.status_code.name == "ERROR" + assert step_attrs["gen_ai.react.finish_reason"] == "error" + assert chain.status.status_code.name == "ERROR" + + +class TestEntrySpan: + def test_run_task_entry_wraps_orchestrator_trace(self, instrument, span_exporter): + from vita.run import run_task + + def fake_internal(**kwargs): + return _make_orchestrator().run() + + responses = [ + _mock_requests_post(_tool_call_response()), + _mock_requests_post(_text_response()), + ] + task = SimpleNamespace( + id="task_001", + instructions="Check order 123", + message_history=None, + ) + + with patch("vita.run._run_task_internal", side_effect=fake_internal), patch( + "vita.utils.llm_utils.models", FAKE_MODELS_CONFIG + ), patch("requests.post", side_effect=responses): + result = run_task("delivery", task, "llm_agent", "user_simulator") + + assert result.termination_reason == "agent_stop" + spans = span_exporter.get_finished_spans() + entry = next(s for s in spans if s.name == "enter_ai_application_system") + chain = next(s for s in spans if s.name == "workflow delivery") + attrs = dict(entry.attributes) + assert attrs["gen_ai.operation.name"] == "enter" + assert attrs["gen_ai.span.kind"] == "ENTRY" + assert attrs["gen_ai.framework"] == "vitabench" + assert "gen_ai.session.id" in attrs + assert chain.parent.span_id == entry.context.span_id + + +class TestProviderInference: + def test_common_provider_names(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + for model in ("gpt-4", "claude-3-opus", "qwen-max"): + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", + return_value=_mock_requests_post(_make_openai_response(content="Hi")), + ): + generate( + model=model, + messages=[UserMessage(role="user", content="Hi")], + ) + + providers = { + dict(s.attributes)["gen_ai.request.model"]: dict(s.attributes)[ + "gen_ai.provider.name" + ] + for s in span_exporter.get_finished_spans() + if s.name.startswith("chat ") + } + assert providers["gpt-4"] == "openai" + assert providers["claude-3-opus"] == "anthropic" + assert providers["qwen-max"] == "alibaba_cloud" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md new file mode 100644 index 000000000..4b4aac443 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md @@ -0,0 +1,17 @@ +# LoongSuite WideSearch Instrumentation + +OpenTelemetry instrumentation for the [WideSearch](https://github.com/ByteDance-Seed/WideSearch) multi-agent search framework. + +## Installation + +```bash +pip install loongsuite-instrumentation-widesearch +``` + +## Usage + +```python +from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + +WideSearchInstrumentor().instrument() +``` diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml new file mode 100644 index 000000000..9a819d25a --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-widesearch" +dynamic = ["version"] +description = "LoongSuite WideSearch Instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.11" +authors = [ + { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api ~= 1.37", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "opentelemetry-util-genai", + "wrapt >= 1.17.3, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + "widesearch >= 0.1.0", +] +test = [ + "pytest ~= 8.0", + "pytest-cov ~= 4.1.0", +] + +[project.entry-points.opentelemetry_instrumentor] +widesearch = "opentelemetry.instrumentation.widesearch:WideSearchInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-widesearch" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/widesearch/version.py" + +[tool.hatch.build.targets.sdist] +include = ["/src", "/tests"] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py new file mode 100644 index 000000000..9c441d18f --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py @@ -0,0 +1,164 @@ +""" +WideSearch instrumentation supporting `widesearch >= 0.1.0`. + +Usage +----- +.. code:: python + + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + WideSearchInstrumentor().instrument() + +API +--- +""" + +from __future__ import annotations + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.instrumentation.widesearch.package import _instruments +from opentelemetry.instrumentation.widesearch.patch import ( + wrap_create_sub_agents_factory, + wrap_invoke_tool_call, + wrap_run_single_query, + wrap_runner_run, + wrap_runner_step, +) +from opentelemetry.instrumentation.widesearch.version import __version__ +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler + +logger = logging.getLogger(__name__) + +_RUN_MODULE = "src.agent.run" +_MULTI_AGENT_MODULE = "src.agent.multi_agent_tools" + +__all__ = ["WideSearchInstrumentor", "__version__"] + + +class WideSearchInstrumentor(BaseInstrumentor): + """OpenTelemetry instrumentor for WideSearch framework. + + Instruments the following components: + - run_single_query(): ENTRY span + - Runner.run(): AGENT span (async generator) + - Runner._step(): STEP span + - Runner._invoke_tool_call(): TOOL spans + - create_sub_agents_wrap(): TASK span + """ + + def __init__(self): + super().__init__() + self._handler = None + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + meter_provider = kwargs.get("meter_provider") + logger_provider = kwargs.get("logger_provider") + + self._handler = ExtendedTelemetryHandler( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + logger_provider=logger_provider, + ) + + # H1: ENTRY span + try: + wrap_function_wrapper( + module=_RUN_MODULE, + name="run_single_query", + wrapper=lambda w, i, a, k: wrap_run_single_query( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented run_single_query") + except Exception as e: + logger.warning(f"Failed to instrument run_single_query: {e}") + + # H2: AGENT span + try: + wrap_function_wrapper( + module=_RUN_MODULE, + name="Runner.run", + wrapper=lambda w, i, a, k: wrap_runner_run( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented Runner.run") + except Exception as e: + logger.warning(f"Failed to instrument Runner.run: {e}") + + # H3: STEP span + try: + wrap_function_wrapper( + module=_RUN_MODULE, + name="Runner._step", + wrapper=lambda w, i, a, k: wrap_runner_step( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented Runner._step") + except Exception as e: + logger.warning(f"Failed to instrument Runner._step: {e}") + + # H4: TOOL spans + try: + wrap_function_wrapper( + module=_RUN_MODULE, + name="Runner._invoke_tool_call", + wrapper=lambda w, i, a, k: wrap_invoke_tool_call( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented Runner._invoke_tool_call") + except Exception as e: + logger.warning( + f"Failed to instrument Runner._invoke_tool_call: {e}" + ) + + # H5: TASK span (wrap factory) + try: + wrap_function_wrapper( + module=_MULTI_AGENT_MODULE, + name="create_sub_agents_wrap", + wrapper=lambda w, i, a, k: wrap_create_sub_agents_factory( + w, i, a, k, handler=self._handler + ), + ) + logger.debug("Instrumented create_sub_agents_wrap") + except Exception as e: + logger.warning( + f"Failed to instrument create_sub_agents_wrap: {e}" + ) + + def _uninstrument(self, **kwargs: Any) -> None: + try: + import src.agent.run # noqa: PLC0415 + + unwrap(src.agent.run, "run_single_query") + unwrap(src.agent.run.Runner, "run") + unwrap(src.agent.run.Runner, "_step") + unwrap(src.agent.run.Runner, "_invoke_tool_call") + logger.debug("Uninstrumented src.agent.run") + except Exception as e: + logger.warning(f"Failed to uninstrument src.agent.run: {e}") + + try: + import src.agent.multi_agent_tools # noqa: PLC0415 + + unwrap(src.agent.multi_agent_tools, "create_sub_agents_wrap") + logger.debug("Uninstrumented src.agent.multi_agent_tools") + except Exception as e: + logger.warning( + f"Failed to uninstrument src.agent.multi_agent_tools: {e}" + ) + + self._handler = None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/package.py new file mode 100644 index 000000000..bd0572292 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/package.py @@ -0,0 +1,2 @@ +_instruments = ("widesearch >= 0.1.0",) +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py new file mode 100644 index 000000000..32ac6287b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py @@ -0,0 +1,338 @@ +"""Patch functions for WideSearch instrumentation. + +Wraps key WideSearch methods to generate OpenTelemetry spans: +- run_single_query -> ENTRY span +- Runner.run -> AGENT span (async generator) +- Runner._step -> STEP span +- Runner._invoke_tool_call -> TOOL spans (one per tool_call) +- create_sub_agents_wrap -> TASK span (on returned closure) +""" + +from __future__ import annotations + +import asyncio +import json +import logging +from contextvars import ContextVar + +from opentelemetry.trace import SpanKind, StatusCode +from opentelemetry.trace.status import Status +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler +from opentelemetry.util.genai.extended_types import ReactStepInvocation +from opentelemetry.util.genai.types import Error + +from .utils import ( + _create_agent_invocation, + _create_entry_invocation, + _create_tool_invocation, + _extract_output_messages, + _step_to_output_messages, +) + +logger = logging.getLogger(__name__) + +_step_counter: ContextVar[int] = ContextVar("ws_step_counter", default=0) +_in_run_single_query: ContextVar[bool] = ContextVar("ws_in_rsq", default=False) + + +async def wrap_run_single_query( + wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler +): + """H1: ENTRY span for run_single_query.""" + if _in_run_single_query.get(): + return await wrapped(*args, **kwargs) + token = _in_run_single_query.set(True) + + query = args[0] if args else kwargs.get("query", "") + try: + invocation = _create_entry_invocation(query) + except Exception as e: + logger.debug(f"Failed to create entry invocation: {e}") + _in_run_single_query.reset(token) + return await wrapped(*args, **kwargs) + + handler.start_entry(invocation) + + try: + result = await wrapped(*args, **kwargs) + invocation.output_messages = _extract_output_messages(result) + handler.stop_entry(invocation) + return result + except Exception as e: + handler.fail_entry(invocation, Error(message=str(e), type=type(e))) + raise + finally: + _in_run_single_query.reset(token) + + +async def wrap_runner_run( + wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler +): + """H2: AGENT span for Runner.run (async generator).""" + starting_agent = args[0] if args else kwargs.get("starting_agent") + user_input = args[1] if len(args) > 1 else kwargs.get("user_input", "") + + try: + invocation = _create_agent_invocation(starting_agent, user_input) + except Exception as e: + logger.debug(f"Failed to create agent invocation: {e}") + async for step in wrapped(*args, **kwargs): + yield step + return + + counter_token = _step_counter.set(0) + handler.start_invoke_agent(invocation) + + try: + last_step = None + async for step in wrapped(*args, **kwargs): + last_step = step + yield step + + if last_step: + invocation.output_messages = _step_to_output_messages(last_step) + handler.stop_invoke_agent(invocation) + except GeneratorExit as e: + handler.fail_invoke_agent( + invocation, Error(message="GeneratorExit", type=GeneratorExit) + ) + raise + except Exception as e: + handler.fail_invoke_agent( + invocation, Error(message=str(e), type=type(e)) + ) + raise + finally: + _step_counter.reset(counter_token) + + +async def wrap_runner_step( + wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler +): + """H3: STEP span for Runner._step.""" + step_num = _step_counter.get() + 1 + _step_counter.set(step_num) + + invocation = ReactStepInvocation(round=step_num) + invocation.attributes["gen_ai.framework"] = "widesearch" + + try: + handler.start_react_step(invocation) + except Exception as e: + logger.debug(f"Failed to start react step: {e}") + return await wrapped(*args, **kwargs) + + try: + result = await wrapped(*args, **kwargs) + + from src.agent.memory import ActionStep, ActionStepError, StepStatus + + if isinstance(result, ActionStepError): + invocation.finish_reason = "error" + handler.fail_react_step( + invocation, + Error(message=result.message, type=type(result)), + ) + else: + if result.step_status == StepStatus.FINISHED: + invocation.finish_reason = "finished" + elif result.error_marker is not None: + invocation.finish_reason = "error" + else: + invocation.finish_reason = "continue" + handler.stop_react_step(invocation) + + return result + except Exception as e: + invocation.finish_reason = "error" + handler.fail_react_step( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +async def wrap_invoke_tool_call( + wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler +): + """H4: TOOL span for each tool_call inside Runner._invoke_tool_call.""" + agent = args[0] if args else kwargs.get("agent") + model_response = args[1] if len(args) > 1 else kwargs.get("model_response") + + if not model_response.outputs: + return await wrapped(*args, **kwargs) + + resp = model_response.outputs[0] + if not resp.tool_calls: + return await wrapped(*args, **kwargs) + + from src.agent.schema import ErrorMarker, ToolCallResult + + async def _call_with_span(tool_call): + try: + invocation = _create_tool_invocation(tool_call, agent) + except Exception as e: + logger.debug(f"Failed to create tool invocation: {e}") + return await _call_original(tool_call, agent) + + handler.start_execute_tool(invocation) + + tool_name = tool_call.tool_name + tool = agent.get_tool_by_name(tool_name) + if tool is None: + invocation.tool_call_result = f"Tool {tool_name} not found" + handler.fail_execute_tool( + invocation, + Error( + message=f"Tool {tool_name} not found", + type=ValueError, + ), + ) + return ToolCallResult( + tool_call_id=tool_call.tool_call_id, + error_marker=ErrorMarker(message=f"Tool {tool_name} not found"), + ) + + arguments = tool_call.arguments + if isinstance(arguments, str): + try: + arguments = json.loads(arguments) + except json.JSONDecodeError: + arguments = {} + + try: + response = await tool(**arguments) + except Exception as e: + invocation.tool_call_result = str(e) + handler.fail_execute_tool( + invocation, Error(message=str(e), type=type(e)) + ) + return ToolCallResult( + tool_call_id=tool_call.tool_call_id, + error_marker=ErrorMarker(message=str(e)), + ) + + error_marker = ( + ErrorMarker(message=response.error) if response.error else None + ) + system_error_marker = ( + ErrorMarker(message=response.system_error) + if response.system_error + else None + ) + + result_content = response.data + invocation.tool_call_result = ( + str(result_content) if result_content else None + ) + + if error_marker or system_error_marker: + msg = (error_marker or system_error_marker)["message"] + handler.fail_execute_tool( + invocation, Error(message=msg, type=RuntimeError) + ) + else: + handler.stop_execute_tool(invocation) + + return ToolCallResult( + tool_call_id=tool_call.tool_call_id, + content=result_content, + error_marker=error_marker, + system_error_marker=system_error_marker, + extra=response.extra if response.extra else {}, + ) + + async def _call_original(tool_call, agent): + """Fallback: execute tool without span.""" + tool_name = tool_call.tool_name + tool = agent.get_tool_by_name(tool_name) + if tool is None: + return ToolCallResult( + tool_call_id=tool_call.tool_call_id, + error_marker=ErrorMarker(message=f"Tool {tool_name} not found"), + ) + arguments = tool_call.arguments + if isinstance(arguments, str): + try: + arguments = json.loads(arguments) + except json.JSONDecodeError: + arguments = {} + try: + response = await tool(**arguments) + except Exception as e: + return ToolCallResult( + tool_call_id=tool_call.tool_call_id, + error_marker=ErrorMarker(message=str(e)), + ) + return ToolCallResult( + tool_call_id=tool_call.tool_call_id, + content=response.data, + error_marker=( + ErrorMarker(message=response.error) if response.error else None + ), + system_error_marker=( + ErrorMarker(message=response.system_error) + if response.system_error + else None + ), + extra=response.extra if response.extra else {}, + ) + + tasks = [_call_with_span(tc) for tc in resp.tool_calls] + results = await asyncio.gather(*tasks) + return [r for r in results if r is not None] + + +def wrap_create_sub_agents_factory( + wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler +): + """H5: TASK span wrapping the closure returned by create_sub_agents_wrap.""" + original_closure = wrapped(*args, **kwargs) + + async def closure_with_task_span(sub_agents): + tracer = handler._tracer + span_name = "run_task create_sub_agents" + + with tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + ) as span: + span.set_attribute("gen_ai.span.kind", "TASK") + span.set_attribute("gen_ai.operation.name", "run_task") + span.set_attribute("gen_ai.framework", "widesearch") + + try: + safe_input = json.dumps( + [ + { + "index": sa.get("index"), + "prompt": sa.get("prompt", "")[:200], + } + for sa in sub_agents + ], + ensure_ascii=False, + ) + span.set_attribute("input.value", safe_input) + except Exception: + pass + + try: + result = await original_closure(sub_agents) + + if result and hasattr(result, "data") and result.data: + output_str = ( + result.data + if isinstance(result.data, str) + else json.dumps(result.data, ensure_ascii=False) + ) + if len(output_str) > 4096: + output_str = output_str[:4096] + "...(truncated)" + span.set_attribute("output.value", output_str) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise + + return closure_with_task_span diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py new file mode 100644 index 000000000..0a8f751f7 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py @@ -0,0 +1,155 @@ +"""Utility functions for WideSearch instrumentation.""" + +from __future__ import annotations + +import json +import logging +from typing import Any, List, Optional + +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) +from opentelemetry.util.genai.types import ( + FunctionToolDefinition, + InputMessage, + OutputMessage, + Text, +) + +logger = logging.getLogger(__name__) + + +_FRAMEWORK = "widesearch" + + +def _create_entry_invocation(query: str) -> EntryInvocation: + invocation = EntryInvocation() + invocation.input_messages = [ + InputMessage(role="user", parts=[Text(content=query)]) + ] + invocation.attributes["gen_ai.framework"] = _FRAMEWORK + return invocation + + +def _create_agent_invocation( + agent: Any, user_input: str +) -> InvokeAgentInvocation: + agent_name = getattr(agent, "name", None) or "widesearch-agent" + + request_model = None + model_config_name = getattr(agent, "model_config_name", None) + if model_config_name: + try: + from src.utils.config import model_config + + request_model = model_config.get(model_config_name, {}).get( + "model_name" + ) + except Exception: + pass + request_model = request_model or model_config_name + + instructions = getattr(agent, "instructions", None) or "" + + invocation = InvokeAgentInvocation( + provider="widesearch", + agent_name=agent_name, + agent_description=instructions[:200] if instructions else "", + request_model=request_model, + input_messages=[ + InputMessage(role="user", parts=[Text(content=user_input)]) + ], + ) + invocation.attributes["gen_ai.framework"] = _FRAMEWORK + + if instructions: + invocation.system_instruction = [Text(content=instructions)] + + tools_desc = getattr(agent, "tools_desc", None) + if tools_desc: + invocation.tool_definitions = _convert_tools_desc(tools_desc) + + return invocation + + +def _create_tool_invocation( + tool_call: Any, agent: Any +) -> ExecuteToolInvocation: + args = tool_call.arguments + if isinstance(args, str): + try: + args = json.loads(args) + except (json.JSONDecodeError, ValueError): + args = {"raw": args} + + description = None + if hasattr(agent, "tools_desc"): + for td in agent.tools_desc: + func = td.get("function", {}) + if func.get("name") == tool_call.tool_name: + description = func.get("description") + break + + invocation = ExecuteToolInvocation( + tool_name=tool_call.tool_name, + tool_call_id=getattr(tool_call, "tool_call_id", None), + tool_call_arguments=args, + tool_description=description, + tool_type="function", + ) + invocation.attributes["gen_ai.framework"] = _FRAMEWORK + return invocation + + +def _extract_output_messages(messages: Any) -> List[OutputMessage]: + """Extract output messages from run_single_query return value.""" + if not messages: + return [] + last_msg = messages[-1] + content = "" + if isinstance(last_msg, dict): + c = last_msg.get("content", {}) + if isinstance(c, dict): + content = c.get("content", "") + elif isinstance(c, str): + content = c + return [ + OutputMessage( + role="assistant", + parts=[Text(content=content)], + finish_reason="stop", + ) + ] + + +def _step_to_output_messages(step: Any) -> List[OutputMessage]: + """Extract output messages from an ActionStep.""" + content = getattr(step, "content", None) or "" + return [ + OutputMessage( + role="assistant", + parts=[Text(content=content)], + finish_reason="stop", + ) + ] + + +def _convert_tools_desc( + tools_desc: List[dict], +) -> Optional[List[FunctionToolDefinition]]: + """Convert WideSearch tools_desc to FunctionToolDefinition list.""" + result = [] + for td in tools_desc: + if td.get("type") == "function": + func = td.get("function", {}) + result.append( + FunctionToolDefinition( + name=func.get("name", ""), + description=func.get("description"), + parameters=func.get("parameters"), + ) + ) + return result if result else None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py new file mode 100644 index 000000000..26056b5d8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py @@ -0,0 +1 @@ +__version__ = "0.5.0.dev" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py new file mode 100644 index 000000000..fa827987c --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py @@ -0,0 +1,386 @@ +"""Test configuration for WideSearch instrumentation tests. + +Injects lightweight stub modules for `src.agent.*` into sys.modules +so that wrap_function_wrapper can find them without installing WideSearch. +""" + +from __future__ import annotations + +import os +import sys +import types +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Callable, List, Literal + +import pytest + +# --------------------------------------------------------------------------- +# Stub modules for WideSearch (src.agent.*) +# --------------------------------------------------------------------------- + + +class StepStatus(str, Enum): + USER = "USER" + FINISHED = "FINISHED" + CONTINUE = "CONTINUE" + ERROR = "ERROR" + + +@dataclass +class ActionStepError: + message: str + source: Literal["llm"] = "llm" + + +@dataclass +class ToolCall: + tool_name: str + arguments: Any + tool_call_id: str + + +@dataclass +class ErrorMarker: + message: str + + def __getitem__(self, key): + if key == "message": + return self.message + raise KeyError(key) + + +@dataclass +class ToolCallResult: + tool_call_id: str + content: str | None = None + error_marker: Any = None + system_error_marker: Any = None + extra: dict = field(default_factory=dict) + + +@dataclass +class LLMOutputItem: + role: str = "assistant" + content: str | None = None + reasoning_content: str | None = None + signature: str | None = None + tool_calls: list = field(default_factory=list) + + +@dataclass +class ModelResponse: + outputs: list = field(default_factory=list) + session_id: str | None = None + error_marker: Any = None + + +@dataclass +class ActionStep: + step_status: StepStatus = StepStatus.CONTINUE + content: str | None = None + reasoning_content: str | None = None + signature: str | None = None + tool_calls: list = field(default_factory=list) + tool_call_results: list = field(default_factory=list) + error_marker: Any = None + + +@dataclass +class UserInputStep: + user_input: str + step_status: StepStatus = StepStatus.USER + + +@dataclass +class MemoryTurn: + steps: list = field(default_factory=list) + + @property + def step_number(self): + return sum(1 for s in self.steps if isinstance(s, ActionStep)) + + def is_finished(self) -> bool: + if not self.steps: + return False + return self.steps[-1].step_status == StepStatus.FINISHED + + +@dataclass +class MemoryAgent: + system_instructions: str | None = None + turns: list = field(default_factory=list) + + def insert_user_input(self, user_input: str): + turn = MemoryTurn() + turn.steps.append(UserInputStep(user_input=user_input)) + self.turns.append(turn) + return turn + + def insert_action_step(self, action_step): + last_turn = self.turns[-1] + last_turn.steps.append(action_step) + return last_turn + + def to_message(self, **kwargs): + return [] + + +@dataclass +class InternalResponse: + data: Any = None + error: str | None = None + system_error: str | None = None + extra: dict | None = None + + +@dataclass +class Agent: + name: str = "test-agent" + instructions: str | None = "You are a helpful agent." + tools: dict = field(default_factory=dict) + tools_desc: list = field(default_factory=list) + model_config_name: str = "gpt-4o" + + def get_tool_by_name(self, tool_name: str): + return self.tools.get(tool_name) + + +DEFAULT_MAX_STEPS = 50 +DEFAULT_MAX_ERROR_COUNT = 3 + + +class Runner: + _step_override = None # Set to a callable to override _step behavior + + @classmethod + async def run( + cls, + starting_agent, + user_input: str, + memory=None, + *, + max_steps: int = DEFAULT_MAX_STEPS, + llm_error_strategy: str = "retry", + ): + if memory is None: + memory = MemoryAgent( + system_instructions=starting_agent.instructions + ) + last_turn = memory.insert_user_input(user_input) + step_result = await cls._step(agent=starting_agent, memory=memory) + if not isinstance(step_result, ActionStepError): + yield step_result + + @classmethod + async def _step(cls, *, agent, memory) -> ActionStep | ActionStepError: + if cls._step_override is not None: + return await cls._step_override(agent=agent, memory=memory) + return ActionStep(step_status=StepStatus.FINISHED, content="Done") + + @classmethod + async def _invoke_tool_call( + cls, agent, model_response + ) -> list: + return [] + + +async def run_single_query( + query: str, + agent_name: str = "", + model_config_name: str = "", + tools: dict = None, + tools_desc: list = None, + system_prompt: str = "", +): + agent = Agent( + name=agent_name, + tools=tools or {}, + tools_desc=tools_desc or [], + model_config_name=model_config_name, + ) + memory = MemoryAgent(system_instructions=system_prompt) + + # Mirrors real implementation: calls Runner.run as async generator + async for step in Runner.run(agent, query, memory): + pass + + last_content = "final answer" + if memory.turns: + last_turn = memory.turns[-1] + for s in reversed(last_turn.steps): + if isinstance(s, ActionStep) and s.content: + last_content = s.content + break + + return [ + {"role": "user", "content": query}, + {"role": "assistant", "content": {"content": last_content}}, + ] + + +def _default_tools(): + return {} + + +def get_system_prompt(language="zh"): + return "You are a helpful assistant." + + +def create_sub_agents_wrap( + agent_name, model_config_name, tools, tools_desc, system_prompt +): + async def create_sub_agents(sub_agents: list) -> InternalResponse: + import json + + results = [] + for sa in sub_agents: + results.append( + {"index": sa.get("index"), "prompt": sa.get("prompt", ""), "response": "sub result"} + ) + return InternalResponse( + data=json.dumps(results, ensure_ascii=False) + ) + + return create_sub_agents + + +def _inject_stub_modules(): + """Inject stub modules into sys.modules so that wrapt can resolve them.""" + # Create module hierarchy: src -> src.agent -> src.agent.run, etc. + src_mod = types.ModuleType("src") + src_agent_mod = types.ModuleType("src.agent") + src_agent_run_mod = types.ModuleType("src.agent.run") + src_agent_multi_agent_tools_mod = types.ModuleType("src.agent.multi_agent_tools") + src_agent_memory_mod = types.ModuleType("src.agent.memory") + src_agent_schema_mod = types.ModuleType("src.agent.schema") + src_agent_tools_mod = types.ModuleType("src.agent.tools") + src_agent_prompt_mod = types.ModuleType("src.agent.prompt") + src_utils_mod = types.ModuleType("src.utils") + src_utils_config_mod = types.ModuleType("src.utils.config") + + # Populate src.agent.run + src_agent_run_mod.Runner = Runner + src_agent_run_mod.run_single_query = run_single_query + src_agent_run_mod.run_turn = None + src_agent_run_mod.extract_messages_from_memory = None + + # Populate src.agent.multi_agent_tools + src_agent_multi_agent_tools_mod.create_sub_agents_wrap = create_sub_agents_wrap + + # Populate src.agent.memory + src_agent_memory_mod.ActionStep = ActionStep + src_agent_memory_mod.ActionStepError = ActionStepError + src_agent_memory_mod.MemoryAgent = MemoryAgent + src_agent_memory_mod.StepStatus = StepStatus + src_agent_memory_mod.UserInputStep = UserInputStep + + # Populate src.agent.schema + src_agent_schema_mod.ToolCall = ToolCall + src_agent_schema_mod.ToolCallResult = ToolCallResult + src_agent_schema_mod.ModelResponse = ModelResponse + src_agent_schema_mod.ErrorMarker = ErrorMarker + src_agent_schema_mod.LLMOutputItem = LLMOutputItem + + # Populate src.agent.tools + src_agent_tools_mod.InternalResponse = InternalResponse + src_agent_tools_mod._default_tools = {} + + # Populate src.agent.prompt + src_agent_prompt_mod.get_system_prompt = get_system_prompt + + # Populate src.agent.agent + src_agent_agent_mod = types.ModuleType("src.agent.agent") + src_agent_agent_mod.Agent = Agent + src_agent_agent_mod.DEFAULT_MAX_STEPS = DEFAULT_MAX_STEPS + src_agent_agent_mod.DEFAULT_MAX_ERROR_COUNT = DEFAULT_MAX_ERROR_COUNT + + # Populate src.utils.config + src_utils_config_mod.model_config = { + "gpt-4o": {"model_name": "gpt-4o-2024-05-13"}, + } + + # Wire up parent references + src_mod.agent = src_agent_mod + src_mod.utils = src_utils_mod + src_agent_mod.run = src_agent_run_mod + src_agent_mod.multi_agent_tools = src_agent_multi_agent_tools_mod + src_agent_mod.memory = src_agent_memory_mod + src_agent_mod.schema = src_agent_schema_mod + src_agent_mod.tools = src_agent_tools_mod + src_agent_mod.prompt = src_agent_prompt_mod + src_agent_mod.agent = src_agent_agent_mod + + # Register in sys.modules + sys.modules["src"] = src_mod + sys.modules["src.agent"] = src_agent_mod + sys.modules["src.agent.run"] = src_agent_run_mod + sys.modules["src.agent.multi_agent_tools"] = src_agent_multi_agent_tools_mod + sys.modules["src.agent.memory"] = src_agent_memory_mod + sys.modules["src.agent.schema"] = src_agent_schema_mod + sys.modules["src.agent.tools"] = src_agent_tools_mod + sys.modules["src.agent.prompt"] = src_agent_prompt_mod + sys.modules["src.agent.agent"] = src_agent_agent_mod + sys.modules["src.utils"] = src_utils_mod + sys.modules["src.utils.config"] = src_utils_config_mod + + +# Inject stubs before any test imports the instrumentation module +_inject_stub_modules() + + +# --------------------------------------------------------------------------- +# OTel test fixtures +# --------------------------------------------------------------------------- + + +def pytest_configure(config: pytest.Config): + os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental" + os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_only" + + +from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import InMemoryMetricReader + + +@pytest.fixture(scope="function", name="span_exporter") +def fixture_span_exporter(): + exporter = InMemorySpanExporter() + yield exporter + + +@pytest.fixture(scope="function", name="metric_reader") +def fixture_metric_reader(): + reader = InMemoryMetricReader() + yield reader + + +@pytest.fixture(scope="function", name="tracer_provider") +def fixture_tracer_provider(span_exporter): + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function", name="meter_provider") +def fixture_meter_provider(metric_reader): + meter_provider = MeterProvider(metric_readers=[metric_reader]) + return meter_provider + + +@pytest.fixture(scope="function") +def instrument(tracer_provider, meter_provider): + instrumentor = WideSearchInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py new file mode 100644 index 000000000..7ddc04a2a --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py @@ -0,0 +1,715 @@ +"""Tests for WideSearch instrumentation. + +Covers: +- Instrumentor lifecycle (instrument/uninstrument idempotency) +- 5 span types: ENTRY, AGENT, STEP, TOOL, TASK +- Parent-child relationships +- Key attributes +- Error paths +""" + +from __future__ import annotations + +import asyncio +import json +import sys +from dataclasses import field +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from opentelemetry.trace import StatusCode + +from .conftest import ( + ActionStep, + ActionStepError, + Agent, + ErrorMarker, + InternalResponse, + LLMOutputItem, + MemoryAgent, + ModelResponse, + Runner, + StepStatus, + ToolCall, + ToolCallResult, +) + + +def _run_async(coro): + """Helper to run async coroutines in tests.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +def _run_async_gen(async_gen): + """Helper to consume an async generator.""" + async def _consume(): + results = [] + async for item in async_gen: + results.append(item) + return results + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(_consume()) + finally: + loop.close() + + +# --------------------------------------------------------------------------- +# Instrumentor Lifecycle Tests +# --------------------------------------------------------------------------- + + +class TestInstrumentorLifecycle: + def test_instrument_and_uninstrument(self, tracer_provider, meter_provider): + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + instrumentor = WideSearchInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_double_instrument_uninstrument(self, tracer_provider, meter_provider): + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + instrumentor = WideSearchInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + instrumentor.uninstrument() + + instrumentor2 = WideSearchInstrumentor() + instrumentor2.instrument( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + assert instrumentor2._handler is not None + instrumentor2.uninstrument() + + def test_instrumentation_dependencies(self): + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + instrumentor = WideSearchInstrumentor() + deps = instrumentor.instrumentation_dependencies() + assert ("widesearch >= 0.1.0",) == deps + + +# --------------------------------------------------------------------------- +# ENTRY Span Tests (H1: run_single_query) +# --------------------------------------------------------------------------- + + +class TestEntrySpan: + def test_entry_span_created(self, span_exporter, instrument): + """run_single_query should produce an ENTRY span.""" + from src.agent.run import run_single_query + + _run_async(run_single_query("What is AI?", agent_name="searcher")) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + + entry = entry_spans[0] + attrs = dict(entry.attributes) + assert attrs.get("gen_ai.span.kind") == "ENTRY" + assert attrs.get("gen_ai.operation.name") == "enter" + assert attrs.get("gen_ai.framework") == "widesearch" + + def test_entry_span_error(self, span_exporter, instrument): + """ENTRY span should record ERROR on exception.""" + from src.agent.run import Runner, run_single_query + + async def failing_step(*, agent, memory): + raise RuntimeError("LLM connection failed") + + Runner._step_override = failing_step + + try: + with pytest.raises(RuntimeError, match="LLM connection failed"): + _run_async(run_single_query("test")) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + assert entry_spans[0].status.status_code == StatusCode.ERROR + + +# --------------------------------------------------------------------------- +# AGENT Span Tests (H2: Runner.run) +# --------------------------------------------------------------------------- + + +class TestAgentSpan: + def test_agent_span_created(self, span_exporter, instrument): + """Runner.run should produce an AGENT span.""" + from src.agent.run import Runner + + agent = Agent(name="search-agent", model_config_name="gpt-4o") + + async def _run(): + results = [] + async for step in Runner.run(agent, "Hello"): + results.append(step) + return results + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + agent_spans = [ + s for s in spans if "invoke_agent" in s.name + ] + assert len(agent_spans) == 1 + + span = agent_spans[0] + attrs = dict(span.attributes) + assert attrs.get("gen_ai.span.kind") == "AGENT" + assert attrs.get("gen_ai.operation.name") == "invoke_agent" + assert attrs.get("gen_ai.agent.name") == "search-agent" + assert attrs.get("gen_ai.framework") == "widesearch" + + def test_agent_span_is_child_of_entry(self, span_exporter, instrument): + """AGENT span should be a child of ENTRY span.""" + from src.agent.run import run_single_query + + _run_async(run_single_query("test query", agent_name="test")) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + + entry = entry_spans[0] + agent = agent_spans[0] + assert agent.parent.span_id == entry.context.span_id + + def test_agent_span_error(self, span_exporter, instrument): + """AGENT span should record ERROR when _step raises.""" + from src.agent.run import Runner + + async def failing_step(*, agent, memory): + raise ValueError("Step failure") + + Runner._step_override = failing_step + agent = Agent(name="fail-agent") + + async def _run(): + async for _ in Runner.run(agent, "Hello"): + pass + + try: + with pytest.raises(ValueError): + _run_async(_run()) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + assert len(agent_spans) == 1 + assert agent_spans[0].status.status_code == StatusCode.ERROR + + +# --------------------------------------------------------------------------- +# STEP Span Tests (H3: Runner._step) +# --------------------------------------------------------------------------- + + +class TestStepSpan: + def test_step_span_created(self, span_exporter, instrument): + """Runner._step should produce a STEP span.""" + from src.agent.run import Runner + + agent = Agent(name="stepper") + + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) >= 1 + + step = step_spans[0] + attrs = dict(step.attributes) + assert attrs.get("gen_ai.span.kind") == "STEP" + assert attrs.get("gen_ai.operation.name") == "react" + assert attrs.get("gen_ai.react.round") == 1 + + def test_step_span_is_child_of_agent(self, span_exporter, instrument): + """STEP span should be child of AGENT span.""" + from src.agent.run import Runner + + agent = Agent(name="stepper") + + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(agent_spans) == 1 + assert len(step_spans) >= 1 + + agent_span = agent_spans[0] + step_span = step_spans[0] + assert step_span.parent.span_id == agent_span.context.span_id + + def test_step_span_finish_reason_finished(self, span_exporter, instrument): + """STEP span should have finish_reason='finished' when step finishes.""" + from src.agent.run import Runner + + agent = Agent(name="stepper") + + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) >= 1 + attrs = dict(step_spans[0].attributes) + assert attrs.get("gen_ai.react.finish_reason") == "finished" + + def test_step_span_error_on_action_step_error( + self, span_exporter, instrument + ): + """STEP span should record ERROR when _step returns ActionStepError.""" + from src.agent.run import Runner + + async def error_step(*, agent, memory): + return ActionStepError(message="LLM timeout") + + Runner._step_override = error_step + agent = Agent(name="error-agent") + + try: + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) >= 1 + assert step_spans[0].status.status_code == StatusCode.ERROR + attrs = dict(step_spans[0].attributes) + assert attrs.get("gen_ai.react.finish_reason") == "error" + + +# --------------------------------------------------------------------------- +# TOOL Span Tests (H4: Runner._invoke_tool_call) +# --------------------------------------------------------------------------- + + +class TestToolSpan: + def test_tool_span_created(self, span_exporter, instrument): + """_invoke_tool_call should produce TOOL spans.""" + from src.agent.run import Runner + + async def mock_tool(**kwargs): + return InternalResponse(data="search results") + + agent = Agent( + name="tool-agent", + tools={"search_global": mock_tool}, + tools_desc=[ + { + "type": "function", + "function": { + "name": "search_global", + "description": "Search the web", + "parameters": {}, + }, + } + ], + ) + + tc = ToolCall( + tool_name="search_global", + arguments='{"q": "AI"}', + tool_call_id="call_123", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + _run_async(Runner._invoke_tool_call(agent, model_resp)) + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + + span = tool_spans[0] + attrs = dict(span.attributes) + assert attrs.get("gen_ai.span.kind") == "TOOL" + assert attrs.get("gen_ai.operation.name") == "execute_tool" + assert attrs.get("gen_ai.tool.name") == "search_global" + assert attrs.get("gen_ai.tool.call.id") == "call_123" + assert attrs.get("gen_ai.framework") == "widesearch" + + def test_tool_span_records_arguments_and_result( + self, span_exporter, instrument + ): + """TOOL span should record arguments and result.""" + from src.agent.run import Runner + + async def mock_tool(q=""): + return InternalResponse(data=f"results for: {q}") + + agent = Agent( + name="tool-agent", + tools={"search_global": mock_tool}, + ) + + tc = ToolCall( + tool_name="search_global", + arguments=json.dumps({"q": "OpenTelemetry"}), + tool_call_id="call_456", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 1 + assert results[0].content == "results for: OpenTelemetry" + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + attrs = dict(tool_spans[0].attributes) + assert "gen_ai.tool.call.arguments" in attrs + assert "gen_ai.tool.call.result" in attrs + + def test_tool_span_error_on_missing_tool(self, span_exporter, instrument): + """TOOL span should record ERROR when tool not found.""" + from src.agent.run import Runner + + agent = Agent(name="tool-agent", tools={}) + + tc = ToolCall( + tool_name="nonexistent_tool", + arguments="{}", + tool_call_id="call_789", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 1 + assert results[0].error_marker is not None + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + assert tool_spans[0].status.status_code == StatusCode.ERROR + + def test_tool_span_error_on_exception(self, span_exporter, instrument): + """TOOL span should record ERROR when tool raises exception.""" + from src.agent.run import Runner + + async def failing_tool(**kwargs): + raise ConnectionError("Network error") + + agent = Agent( + name="tool-agent", + tools={"flaky_tool": failing_tool}, + ) + + tc = ToolCall( + tool_name="flaky_tool", + arguments="{}", + tool_call_id="call_err", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 1 + assert results[0].error_marker is not None + assert "Network error" in results[0].error_marker.message + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + assert tool_spans[0].status.status_code == StatusCode.ERROR + + def test_multiple_tool_spans(self, span_exporter, instrument): + """Multiple tool_calls should produce multiple TOOL spans.""" + from src.agent.run import Runner + + async def mock_search(**kwargs): + return InternalResponse(data="search result") + + async def mock_browse(**kwargs): + return InternalResponse(data="page content") + + agent = Agent( + name="multi-tool", + tools={ + "search_global": mock_search, + "text_browser_view": mock_browse, + }, + ) + + tc1 = ToolCall( + tool_name="search_global", + arguments='{"q": "test"}', + tool_call_id="call_1", + ) + tc2 = ToolCall( + tool_name="text_browser_view", + arguments='{"url": "http://example.com"}', + tool_call_id="call_2", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc1, tc2])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 2 + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 2 + + +# --------------------------------------------------------------------------- +# TASK Span Tests (H5: create_sub_agents_wrap) +# --------------------------------------------------------------------------- + + +class TestTaskSpan: + def test_task_span_created(self, span_exporter, instrument): + """create_sub_agents closure should produce a TASK span.""" + from src.agent.multi_agent_tools import create_sub_agents_wrap + + closure = create_sub_agents_wrap( + "main-agent", "gpt-4o", {}, [], "system prompt" + ) + + sub_agents = [ + {"index": 0, "prompt": "Search for X"}, + {"index": 1, "prompt": "Search for Y"}, + ] + + result = _run_async(closure(sub_agents)) + assert result is not None + + spans = span_exporter.get_finished_spans() + task_spans = [ + s for s in spans if s.name == "run_task create_sub_agents" + ] + assert len(task_spans) == 1 + + span = task_spans[0] + attrs = dict(span.attributes) + assert attrs.get("gen_ai.span.kind") == "TASK" + assert attrs.get("gen_ai.operation.name") == "run_task" + assert attrs.get("gen_ai.framework") == "widesearch" + assert "input.value" in attrs + + def test_task_span_records_output(self, span_exporter, instrument): + """TASK span should record output.value.""" + from src.agent.multi_agent_tools import create_sub_agents_wrap + + closure = create_sub_agents_wrap( + "agent", "gpt-4o", {}, [], "prompt" + ) + + sub_agents = [{"index": 0, "prompt": "find info"}] + result = _run_async(closure(sub_agents)) + + spans = span_exporter.get_finished_spans() + task_spans = [ + s for s in spans if s.name == "run_task create_sub_agents" + ] + assert len(task_spans) == 1 + attrs = dict(task_spans[0].attributes) + assert "output.value" in attrs + + def test_task_span_error(self, span_exporter, instrument): + """TASK span should record ERROR when closure raises.""" + from src.agent.multi_agent_tools import create_sub_agents_wrap + + # Temporarily replace create_sub_agents_wrap's inner closure behavior + import src.agent.multi_agent_tools as mat + + original = mat.create_sub_agents_wrap + + def error_factory(*args, **kwargs): + original_closure = original(*args, **kwargs) + + async def error_closure(sub_agents): + raise RuntimeError("Sub-agent execution failed") + + return error_closure + + mat.create_sub_agents_wrap = error_factory + + # Re-instrument to pick up the new function + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + instrument.uninstrument() + instrument.instrument( + tracer_provider=span_exporter._tracer_provider + if hasattr(span_exporter, "_tracer_provider") + else None, + skip_dep_check=True, + ) + + # Since re-instrumentation is complex, let's just test the wrapper directly + # by calling the instrumented version + instrument.uninstrument() + + # Simpler approach: directly test the wrap function + from opentelemetry.instrumentation.widesearch.patch import ( + wrap_create_sub_agents_factory, + ) + from opentelemetry.util.genai.extended_handler import ( + ExtendedTelemetryHandler, + ) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor + from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, + ) + + exporter = InMemorySpanExporter() + tp = TracerProvider() + tp.add_span_processor(SimpleSpanProcessor(exporter)) + handler = ExtendedTelemetryHandler(tracer_provider=tp) + + def failing_factory(*args, **kwargs): + async def failing_closure(sub_agents): + raise RuntimeError("Boom") + + return failing_closure + + wrapped_factory = wrap_create_sub_agents_factory( + failing_factory, None, (), {}, handler=handler + ) + + with pytest.raises(RuntimeError, match="Boom"): + _run_async(wrapped_factory([{"index": 0, "prompt": "x"}])) + + spans = exporter.get_finished_spans() + task_spans = [ + s for s in spans if s.name == "run_task create_sub_agents" + ] + assert len(task_spans) == 1 + assert task_spans[0].status.status_code == StatusCode.ERROR + + +# --------------------------------------------------------------------------- +# Parent-Child Relationship Tests +# --------------------------------------------------------------------------- + + +class TestParentChildRelationships: + def test_full_hierarchy_entry_agent_step(self, span_exporter, instrument): + """Full call through run_single_query should produce ENTRY > AGENT > STEP.""" + from src.agent.run import run_single_query + + _run_async(run_single_query("hierarchy test", agent_name="root")) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + assert len(step_spans) >= 1 + + entry = entry_spans[0] + agent = agent_spans[0] + step = step_spans[0] + + # AGENT is child of ENTRY + assert agent.parent.span_id == entry.context.span_id + # STEP is child of AGENT + assert step.parent.span_id == agent.context.span_id + + def test_tool_span_is_child_of_step(self, span_exporter, instrument): + """TOOL span should be child of the STEP span when invoked during a step.""" + from src.agent.run import Runner + + async def mock_tool(**kwargs): + return InternalResponse(data="result") + + agent = Agent( + name="hierarchy-agent", + tools={"my_tool": mock_tool}, + ) + + async def custom_step(*, agent, memory): + tc = ToolCall( + tool_name="my_tool", + arguments="{}", + tool_call_id="tc_hier", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + await Runner._invoke_tool_call(agent, model_resp) + return ActionStep(step_status=StepStatus.FINISHED, content="done") + + Runner._step_override = custom_step + + try: + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + tool_spans = [s for s in spans if "execute_tool" in s.name] + + assert len(step_spans) >= 1 + assert len(tool_spans) >= 1 + + step_span = step_spans[0] + tool_span = tool_spans[0] + assert tool_span.parent.span_id == step_span.context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md new file mode 100644 index 000000000..1b0499fa4 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md @@ -0,0 +1,55 @@ +# LoongSuite WildToolBench Instrumentation + +OpenTelemetry instrumentation for the [WildToolBench](https://github.com/yupeijei1997/WildToolBench) benchmark framework. + +## Installation + +WildToolBench is not available on PyPI. Install it from source: + +```bash +pip install -e /path/to/WildToolBench/wild-tool-bench +pip install loongsuite-instrumentation-wildtool +``` + +## Requirements + +- **OpenAI provider instrumentation**: To produce LLM spans, you must also enable an OpenAI provider instrumentation (e.g., `opentelemetry-instrumentation-openai` or LoongSuite's equivalent). This plugin creates ENTRY/AGENT/CHAIN/STEP/TOOL spans but does **not** create LLM spans itself. + +## Usage + +```python +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor + +WildToolInstrumentor().instrument() + +# Run WildToolBench as usual — spans are automatically generated. +``` + +## Span Topology + +``` +ENTRY (enter_ai_application_system) +└── AGENT (invoke_agent wildtool) + └── CHAIN (workflow task_{idx}) + └── STEP (react step) + ├── [LLM span — provider instrumentation] + └── TOOL (execute_tool {tool_name}) +``` + +## Patch Points + +| # | Target | Span Type | +|---|--------|-----------| +| P1 | `multi_threaded_inference` | ENTRY | +| P2 | `BaseHandler.inference_multi_turn` | AGENT | +| P3 | `BaseHandler.inference_and_eval_multi_step` | CHAIN + TOOL | +| P4 | `BaseHandler._request_tool_call` | STEP | +| P5 | `BaseHandler._parse_api_response` | (token extraction) | + +## Round 2 fixes (see `llm-dev/execute.md` § "修订记录 (Round 2 fix)") + +- **H1**: TOOL span is now parented on STEP, not CHAIN. Strategy A enhanced — the chain wrapper holds a `round → STEP span` map and uses `trace.set_span_in_context(step_span)` to anchor each post-hoc TOOL span on the matching STEP. STEP `SpanContext`s remain valid parents even after `end()`. +- **H2 (provider-name fallback)**: `opentelemetry-instrumentation-openai-v2 == 0.62b1` only emits the legacy `gen_ai.system` attribute on its LLM span; the new `gen_ai.provider.name` attribute is missing. As a *pure fallback* the wildtool plugin writes both `gen_ai.system="openai"` and `gen_ai.provider.name="openai"` on the **STEP** span (not on the LLM span — that is owned by the OpenAI v2 instrumentation and we do **not** patch it). Once the OpenAI v2 instrumentation upstream emits `gen_ai.provider.name` natively this fallback can be removed. +- **M1**: CHAIN span now carries `input.value` (last user message in `inference_data["messages"]`, truncated to 4096 chars) and `output.value` (JSON of `action_name_label`/`task_idx`/`is_optimal`). +- **M2**: STEP span now carries `gen_ai.react.finish_reason` on error paths. Mapping table is in `execute.md` § "M2: gen_ai.react.finish_reason 取值映射". +- **M3**: TOOL span explicitly writes `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` / `gen_ai.tool.description`, bypassing `OTEL_INSTRUMENTATION_GENAI_CAPTURE_*` gating in `opentelemetry-util-genai`. The custom `wildtool.tool.execution_mode = "ground_truth_replay"` is preserved. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml new file mode 100644 index 000000000..b8f9f44d0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml @@ -0,0 +1,66 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-wildtool" +dynamic = ["version"] +description = "LoongSuite WildToolBench Instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9" +authors = [ + { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api ~= 1.37", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "opentelemetry-util-genai", + "wrapt >= 1.17.3, < 3.0.0", +] + +[project.optional-dependencies] +instruments = [ + "openai >= 1.0.0", +] + +test = [ + "pytest ~= 8.0", + "pytest-cov ~= 4.1.0", + "pytest-forked >= 1.6.0", + "opentelemetry-sdk >= 1.37", + "openai >= 1.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +wildtool = "opentelemetry.instrumentation.wildtool:WildToolInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-wildtool" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/wildtool/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py new file mode 100644 index 000000000..dad772500 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py @@ -0,0 +1,161 @@ +"""OpenTelemetry WildToolBench Instrumentation""" + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.instrumentation.wildtool.package import _instruments +from opentelemetry.instrumentation.wildtool.version import __version__ +from opentelemetry.instrumentation.wildtool._wrappers import ( + WildToolAgentWrapper, + WildToolChainWrapper, + WildToolEntryWrapper, + WildToolParseWrapper, + WildToolRequestWrapper, +) +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler + +logger = logging.getLogger(__name__) + +_LLM_RESPONSE_GEN_MODULE = "wtb._llm_response_generation" +_BASE_HANDLER_MODULE = "wtb.model_handler.base_handler" + +__all__ = ["WildToolInstrumentor", "__version__"] + + +class WildToolInstrumentor(BaseInstrumentor): + """OpenTelemetry instrumentor for WildToolBench framework.""" + + def __init__(self): + super().__init__() + self._handler = None + # Track concrete handler subclasses whose abstract _request_tool_call / + # _parse_api_response we have already wrapped, so we can unwrap on + # uninstrument and avoid double-wrapping. + self._patched_handler_classes: set = set() + self._request_wrapper = None + self._parse_wrapper = None + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + meter_provider = kwargs.get("meter_provider") + logger_provider = kwargs.get("logger_provider") + + self._handler = ExtendedTelemetryHandler( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + logger_provider=logger_provider, + ) + self._request_wrapper = WildToolRequestWrapper(self._handler) + self._parse_wrapper = WildToolParseWrapper(self._handler) + + # P1: ENTRY span + try: + wrap_function_wrapper( + _LLM_RESPONSE_GEN_MODULE, + "multi_threaded_inference", + WildToolEntryWrapper(self._handler), + ) + except Exception as e: + logger.warning("Failed to instrument multi_threaded_inference: %s", e) + + # P2: AGENT span + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + "BaseHandler.inference_multi_turn", + WildToolAgentWrapper(self._handler), + ) + except Exception as e: + logger.warning("Failed to instrument inference_multi_turn: %s", e) + + # P3: CHAIN span (+ STEP + TOOL management). + # The chain wrapper also lazily patches the concrete subclass' + # `_request_tool_call` / `_parse_api_response` on first use, so that + # subclasses overriding the abstract base methods are still + # intercepted (P4 / P5). + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + "BaseHandler.inference_and_eval_multi_step", + WildToolChainWrapper(self._handler, self), + ) + except Exception as e: + logger.warning( + "Failed to instrument inference_and_eval_multi_step: %s", e + ) + + def ensure_handler_class_patched(self, handler_cls) -> None: + """Lazily wrap the concrete handler subclass' P4/P5 methods. + + WildToolBench declares ``_request_tool_call`` and ``_parse_api_response`` + as abstract on ``BaseHandler``, but real handlers (and tests) override + them. Python method resolution dispatches directly to the override and + therefore never reaches a wrapper installed on the base class. We + instead wrap the override on first invocation per subclass. + """ + if handler_cls in self._patched_handler_classes: + return + self._patched_handler_classes.add(handler_cls) + + module_name = handler_cls.__module__ + cls_name = handler_cls.__name__ + for method, wrapper in ( + ("_request_tool_call", self._request_wrapper), + ("_parse_api_response", self._parse_wrapper), + ): + if method not in handler_cls.__dict__: + continue + try: + wrap_function_wrapper( + module_name, + f"{cls_name}.{method}", + wrapper, + ) + except Exception as e: + logger.debug( + "Failed to wrap %s.%s.%s: %s", + module_name, + cls_name, + method, + e, + ) + + def _uninstrument(self, **kwargs: Any) -> None: + try: + import wtb._llm_response_generation as llm_gen + + unwrap(llm_gen, "multi_threaded_inference") + except Exception as e: + logger.debug("Failed to uninstrument multi_threaded_inference: %s", e) + + try: + import wtb.model_handler.base_handler as bh + + unwrap(bh.BaseHandler, "inference_multi_turn") + unwrap(bh.BaseHandler, "inference_and_eval_multi_step") + except Exception as e: + logger.debug("Failed to uninstrument BaseHandler methods: %s", e) + + for cls in list(self._patched_handler_classes): + for method in ("_request_tool_call", "_parse_api_response"): + if method in cls.__dict__: + try: + unwrap(cls, method) + except Exception as e: + logger.debug( + "Failed to unwrap %s.%s: %s", + cls.__name__, + method, + e, + ) + self._patched_handler_classes.clear() + self._request_wrapper = None + self._parse_wrapper = None + self._handler = None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py new file mode 100644 index 000000000..612a332ab --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py @@ -0,0 +1,644 @@ +"""Wrapper classes for WildToolBench instrumentation. + +Each wrapper corresponds to one patch point and manages the lifecycle +of one or more span types. + +Round 2 fix highlights (see ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)"): + +H1 + TOOL span parent is now STEP rather than CHAIN. Each STEP invocation is + appended to a per-chain list in :data:`_chain_step_invocations`; when the + chain wrapper post-processes ``inference_log`` it looks up the matching + STEP span by ``round`` and uses + :func:`opentelemetry.trace.set_span_in_context` so ``start_execute_tool`` + parents the TOOL span on the STEP context (even if STEP is already + closed — its :class:`SpanContext` remains a valid parent reference). + +H2 + The OpenAI v2 provider instrumentation (0.62b1) writes only the legacy + ``gen_ai.system`` attribute to its LLM span. The wildtool plugin now + writes both ``gen_ai.system`` and ``gen_ai.provider.name`` on the STEP + span as a fallback so the new semantic-conventions attribute is present + in the trace tree even before the upstream OpenAI v2 instrumentation + catches up. We do **not** patch the OpenAI v2 instrumentation itself. + +M1 + ``input.value`` (last user message in the chain's ``messages``, truncated + to 4096 chars) and ``output.value`` (a JSON of action label, task index + and is_optimal) are written on the CHAIN span. + +M2 + ``gen_ai.react.finish_reason`` is derived from ``inference_log`` on the + *last* (currently active) STEP. Mappings: + + ``"parse_tool_calls_failed"`` + ``error_reason`` contains "parse tool_calls failed". + ``"action_name_mismatch"`` + ``error_reason`` contains "action name not in candidate". + ``"empty_response"`` + ``error_reason`` contains "tool_calls and content are None". + ``"error"`` + request raised an exception (handled in + :class:`WildToolRequestWrapper`). + +M3 + ``gen_ai.tool.call.arguments``, ``gen_ai.tool.call.result`` and + ``gen_ai.tool.description`` are written explicitly on TOOL spans + *before* close as a fallback. ``opentelemetry-util-genai`` gates these + sensitive attributes behind ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` env + vars; the wildtool plugin always writes them since wtb data is + benchmark-synthetic and never PII. +""" + +import json +import logging +from contextvars import ContextVar +from typing import List, Optional + +from opentelemetry.trace import StatusCode, set_span_in_context +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) +from opentelemetry.util.genai.types import Error + +logger = logging.getLogger(__name__) + +# ─────────────────────────── ContextVars ─────────────────────────────── +# The CHAIN wrapper opens a new logical "chain" by flipping ``_in_chain`` +# and resetting the counter. The REQUEST wrapper reads these to decide +# whether to create a STEP span and what round number to assign. +_in_chain: ContextVar[bool] = ContextVar("_wt_in_chain", default=False) + +# Currently open STEP invocation. Used by the parse wrapper to attach +# token attributes to the right span. +_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar( + "_wt_step_inv", default=None +) +_step_counter: ContextVar[int] = ContextVar("_wt_step_ctr", default=0) + +# Per-chain list of every STEP invocation created in the current chain +# (in `round` order). The chain wrapper allocates this list on entry and +# uses it after ``wrapped`` returns to re-parent TOOL spans onto the +# matching STEP. Even if a STEP span is already ``end()``-ed, its +# :class:`SpanContext` stays valid as a parent reference for new spans. +_chain_step_invocations: ContextVar[Optional[List[ReactStepInvocation]]] = ( + ContextVar("_wt_chain_step_invs", default=None) +) + +_PROVIDER_FALLBACK_NAME = "openai" +_INPUT_VALUE_MAX_CHARS = 4096 + + +def _close_active_step(handler: ExtendedTelemetryHandler) -> None: + """Close the currently active STEP span, if any.""" + prev = _step_invocation.get() + if prev is not None: + try: + handler.stop_react_step(prev) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to close step: %s", e) + _step_invocation.set(None) + + +def _truncate(text: str, max_chars: int) -> str: + if len(text) <= max_chars: + return text + return text[:max_chars] + "...(truncated)" + + +def _stringify(value) -> str: + if isinstance(value, str): + return value + try: + return json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return str(value) + + +class WildToolEntryWrapper: + """P1: Wraps multi_threaded_inference → ENTRY span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + # Signature: multi_threaded_inference(handler, model_name, test_case). + # We only need model_name and test_case for ENTRY attributes; the + # handler instance flows through as args[0] untouched. + model_name = args[1] if len(args) > 1 else kwargs.get("model_name", "") + test_case = args[2] if len(args) > 2 else kwargs.get("test_case", {}) + + invocation = EntryInvocation( + session_id=test_case.get("id"), + attributes={ + "gen_ai.framework": "wildtool", + "gen_ai.request.model": model_name, + "wildtool.turn_count": len(test_case.get("english_tasks", [])), + }, + ) + self._handler.start_entry(invocation) + try: + result = wrapped(*args, **kwargs) + self._handler.stop_entry(invocation) + return result + except Exception as e: + self._handler.fail_entry( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +class WildToolAgentWrapper: + """P2: Wraps BaseHandler.inference_multi_turn → AGENT span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + test_entry = args[0] if args else kwargs.get("test_entry", {}) + + invocation = InvokeAgentInvocation( + provider=None, + agent_name=type(instance).__name__, + conversation_id=test_entry.get("id"), + request_model=getattr(instance, "model_name", None), + attributes={ + "gen_ai.framework": "wildtool", + "wildtool.turn_count": len( + test_entry.get("english_answer_list", []) + ), + }, + ) + self._handler.start_invoke_agent(invocation) + try: + result = wrapped(*args, **kwargs) + total_input = 0 + total_output = 0 + for task_result in (result or []): + if isinstance(task_result, dict): + total_input += sum( + task_result.get("input_token_count", []) + ) + total_output += sum( + task_result.get("output_token_count", []) + ) + if total_input: + invocation.input_tokens = total_input + if total_output: + invocation.output_tokens = total_output + self._handler.stop_invoke_agent(invocation) + return result + except Exception as e: + self._handler.fail_invoke_agent( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +class WildToolChainWrapper: + """P3: Wraps BaseHandler.inference_and_eval_multi_step → CHAIN span. + + Also manages the lifecycle of the final STEP span and creates TOOL spans + from the returned ``inference_log`` after the original function completes. + Round 2 fixes (H1/M1/M2/M3) are implemented here. + """ + + def __init__(self, handler: ExtendedTelemetryHandler, instrumentor=None): + self._handler = handler + self._instrumentor = instrumentor + + def __call__(self, wrapped, instance, args, kwargs): + if self._instrumentor is not None and instance is not None: + try: + self._instrumentor.ensure_handler_class_patched(type(instance)) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to ensure subclass patched: %s", e) + + inference_data = args[0] if args else kwargs.get("inference_data", {}) + if not isinstance(inference_data, dict): + inference_data = {} + task_idx = inference_data.get("task_idx", 0) + test_entry_id = inference_data.get("test_entry_id", "") + + span_name = f"workflow task_{task_idx}" + tracer = self._handler._tracer + + chain_token = _in_chain.set(True) + counter_token = _step_counter.set(0) + step_token = _step_invocation.set(None) + chain_steps: List[ReactStepInvocation] = [] + chain_steps_token = _chain_step_invocations.set(chain_steps) + + chain_attributes = { + "gen_ai.span.kind": "CHAIN", + "gen_ai.operation.name": "workflow", + "gen_ai.framework": "wildtool", + "wildtool.task_idx": task_idx, + "wildtool.test_entry_id": test_entry_id, + } + + # M1: Capture last user message as ``input.value`` BEFORE running the + # wrapped function (the wtb function mutates ``messages`` in place). + input_value = self._extract_input_value(inference_data) + if input_value is not None: + chain_attributes["input.value"] = input_value + + with tracer.start_as_current_span( + name=span_name, attributes=chain_attributes + ) as span: + try: + result = wrapped(*args, **kwargs) + + # M2: Set finish_reason on the currently active (last) STEP + # BEFORE we close it. Only the terminal step ever carries an + # error finish_reason (every wtb error path triggers `break`). + if isinstance(result, dict): + self._apply_last_step_finish_reason( + result.get("inference_log", {}) + ) + + _close_active_step(self._handler) + + if isinstance(result, dict): + label = result.get("action_name_label", "") + is_optimal = bool(result.get("is_optimal", False)) + span.set_attribute("wildtool.action_name_label", label) + span.set_attribute("wildtool.is_optimal", is_optimal) + + # M1: ``output.value`` summarising chain outcome. + try: + span.set_attribute( + "output.value", + json.dumps( + { + "action_name_label": label, + "task_idx": task_idx, + "is_optimal": is_optimal, + }, + ensure_ascii=False, + ), + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set output.value: %s", e) + + # H1 + M3: re-parent TOOL spans on STEP and force-write + # tool call sensitive attributes. + self._create_tool_spans_from_log( + result.get("inference_log", {}), + inference_data, + chain_steps, + ) + + span.set_status(StatusCode.OK) + return result + except Exception as e: + _close_active_step(self._handler) + span.record_exception(e) + span.set_status(StatusCode.ERROR) + raise + finally: + _chain_step_invocations.reset(chain_steps_token) + _step_counter.reset(counter_token) + _step_invocation.reset(step_token) + _in_chain.reset(chain_token) + + # -- M1 --------------------------------------------------------------- + + @staticmethod + def _extract_input_value(inference_data) -> Optional[str]: + msgs = inference_data.get("messages") if isinstance( + inference_data, dict + ) else None + if not isinstance(msgs, list): + return None + for m in reversed(msgs): + if not isinstance(m, dict) or m.get("role") != "user": + continue + content = m.get("content") + if content is None: + continue + text = _stringify(content) + return _truncate(text, _INPUT_VALUE_MAX_CHARS) + return None + + # -- M2 --------------------------------------------------------------- + + def _apply_last_step_finish_reason(self, inference_log) -> None: + if not isinstance(inference_log, dict): + return + current_step = _step_invocation.get() + if current_step is None or current_step.round is None: + return + step_key = f"step_{current_step.round - 1}" + step_data = inference_log.get(step_key) + if not isinstance(step_data, dict): + return + output = step_data.get("inference_output") or {} + if not isinstance(output, dict): + return + label = output.get("current_action_name_label") + error_reason = output.get("error_reason") or "" + reason = self._derive_step_finish_reason(label, error_reason) + if reason is None: + return + # Setting `invocation.finish_reason` is enough — the util-genai + # `_apply_react_step_finish_attributes` writes + # ``gen_ai.react.finish_reason`` from this field on stop. + current_step.finish_reason = reason + + @staticmethod + def _derive_step_finish_reason( + label, error_reason: str + ) -> Optional[str]: + """Map wtb inference_log error_reason → gen_ai.react.finish_reason.""" + if label != "error": + return None + if "parse tool_calls failed" in error_reason: + return "parse_tool_calls_failed" + if "action name not in candidate" in error_reason: + return "action_name_mismatch" + if "tool_calls and content are None" in error_reason: + return "empty_response" + return "error" + + # -- H1 + M3 ---------------------------------------------------------- + + def _create_tool_spans_from_log( + self, + inference_log, + inference_data, + chain_steps: List[ReactStepInvocation], + ) -> None: + """Post-hoc TOOL span creation from inference_log. + + Uses the per-chain STEP invocation list to parent each TOOL span on + the matching STEP span (H1). Sensitive tool-call attributes are + written explicitly on the span (M3) so they appear regardless of + ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` settings. + """ + if not isinstance(inference_log, dict): + return + + # round → SpanContext-bearing OTel context for parenting + step_ctx_by_round = {} + for step_inv in chain_steps: + if step_inv.round is None or step_inv.span is None: + continue + try: + step_ctx_by_round[step_inv.round] = set_span_in_context( + step_inv.span + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to compute step parent context: %s", e) + + # tool name → description (for gen_ai.tool.description) + tool_desc_map = {} + tools = inference_data.get("tools") if isinstance( + inference_data, dict + ) else None + if isinstance(tools, list): + for tool in tools: + if not isinstance(tool, dict): + continue + func = tool.get("function") or tool + if not isinstance(func, dict): + continue + name = func.get("name") + desc = func.get("description") + if name: + tool_desc_map[name] = desc + + # Extract tool observations from final messages keyed by tool_call_id; + # wtb only embeds them in messages (not in inference_answer) for the + # tool_call branch. + observation_by_call_id = {} + messages = inference_data.get("messages") if isinstance( + inference_data, dict + ) else None + if isinstance(messages, list): + for msg in messages: + if not isinstance(msg, dict) or msg.get("role") != "tool": + continue + tid = msg.get("tool_call_id") + if tid is None: + continue + content = msg.get("content") + if content is None: + continue + observation_by_call_id[tid] = ( + content if isinstance(content, str) else _stringify(content) + ) + + for key in sorted(k for k in inference_log if k.startswith("step_")): + try: + step_idx = int(key[len("step_"):]) + except ValueError: + continue + round_num = step_idx + 1 + + step_data = inference_log[key] + if not isinstance(step_data, dict): + continue + output = step_data.get("inference_output") or {} + if not isinstance(output, dict): + continue + tool_calls = output.get("tool_calls") + label = output.get("current_action_name_label") + if not tool_calls or label != "correct": + continue + + answer_data = step_data.get("inference_answer") or {} + candidate = ( + answer_data.get("candidate_0_answer_function_list") + if isinstance(answer_data, dict) + else None + ) or {} + candidate_observation = ( + candidate.get("observation") + if isinstance(candidate, dict) + else None + ) + + parent_ctx = step_ctx_by_round.get(round_num) + + for tc in tool_calls: + if not isinstance(tc, dict): + continue + func = tc.get("function") or {} + if not isinstance(func, dict): + func = {} + tool_name = func.get("name", "unknown") + tool_id = tc.get("id") + tool_args_raw = func.get("arguments", "") + tool_args_str = ( + tool_args_raw + if isinstance(tool_args_raw, str) + else _stringify(tool_args_raw) + ) + + observation_str: Optional[str] = None + if tool_id is not None and tool_id in observation_by_call_id: + observation_str = observation_by_call_id[tool_id] + elif candidate_observation is not None: + observation_str = ( + candidate_observation + if isinstance(candidate_observation, str) + else _stringify(candidate_observation) + ) + + description = tool_desc_map.get(tool_name) + + invocation = ExecuteToolInvocation( + tool_name=tool_name, + tool_call_id=tool_id, + tool_call_arguments=tool_args_str, + tool_call_result=observation_str, + tool_type="function", + tool_description=description, + attributes={ + "wildtool.tool.execution_mode": "ground_truth_replay", + }, + ) + + try: + self._handler.start_execute_tool( + invocation, context=parent_ctx + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to start_execute_tool: %s", e) + continue + + # M3: explicitly write tool_call sensitive attrs. The + # util-genai `_get_tool_call_data_attributes` helper guards + # these behind experimental-mode + content-capture-mode env + # vars which are not always set in real deployments. + tool_span = invocation.span + if tool_span is not None and tool_span.is_recording(): + try: + tool_span.set_attribute( + "gen_ai.tool.call.arguments", tool_args_str + ) + if observation_str is not None: + tool_span.set_attribute( + "gen_ai.tool.call.result", observation_str + ) + if description: + tool_span.set_attribute( + "gen_ai.tool.description", description + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set tool span attrs: %s", e) + + try: + self._handler.stop_execute_tool(invocation) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to stop_execute_tool: %s", e) + + +class WildToolRequestWrapper: + """P4: Wraps BaseHandler._request_tool_call. + + Creates STEP span (ReactStepInvocation) before each LLM call. + Extracts latency from return value. Also writes the H2 provider-name + fallback attributes (``gen_ai.system`` + ``gen_ai.provider.name``) on + the STEP span so the new semconv attribute is present in the trace + even when the upstream OpenAI v2 instrumentation only emits the legacy + ``gen_ai.system``. + """ + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + if not _in_chain.get(): + return wrapped(*args, **kwargs) + + # Close the previous step (the natural end-of-step is when the next + # request fires). The STEP span's SpanContext stays valid as a + # parent for TOOL spans created later. + _close_active_step(self._handler) + + step_num = _step_counter.get() + 1 + _step_counter.set(step_num) + + step_inv = ReactStepInvocation(round=step_num) + try: + self._handler.start_react_step(step_inv) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to start react step: %s", e) + return wrapped(*args, **kwargs) + + # H2: provider-name fallback attributes. Written on the STEP, not + # on the LLM span, because the LLM span is owned by the OpenAI v2 + # provider instrumentation and is created lazily inside the wtb + # request implementation. + if step_inv.span is not None and step_inv.span.is_recording(): + try: + step_inv.span.set_attribute( + "gen_ai.system", _PROVIDER_FALLBACK_NAME + ) + step_inv.span.set_attribute( + "gen_ai.provider.name", _PROVIDER_FALLBACK_NAME + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set provider fallback attrs: %s", e) + + # Track this step for H1 TOOL re-parenting. + chain_steps = _chain_step_invocations.get() + if chain_steps is not None: + chain_steps.append(step_inv) + _step_invocation.set(step_inv) + + try: + result = wrapped(*args, **kwargs) + if isinstance(result, tuple) and len(result) == 2: + _, latency = result + if step_inv.span and step_inv.span.is_recording(): + try: + step_inv.span.set_attribute( + "wildtool.latency", float(latency) + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set wildtool.latency: %s", e) + return result + except Exception as e: + step_inv.finish_reason = "error" + self._handler.fail_react_step( + step_inv, Error(message=str(e), type=type(e)) + ) + _step_invocation.set(None) + raise + + +class WildToolParseWrapper: + """P5: Wraps BaseHandler._parse_api_response. + + Extracts token counts from parsed response and sets them on the + current STEP span as attributes. + """ + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + result = wrapped(*args, **kwargs) + + step_inv = _step_invocation.get() + if step_inv and step_inv.span and step_inv.span.is_recording(): + if isinstance(result, dict): + input_t = result.get("input_token") + output_t = result.get("output_token") + if input_t is not None: + step_inv.span.set_attribute( + "gen_ai.usage.input_tokens", input_t + ) + if output_t is not None: + step_inv.span.set_attribute( + "gen_ai.usage.output_tokens", output_t + ) + + return result diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py new file mode 100644 index 000000000..1ac5bcfee --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py @@ -0,0 +1,2 @@ +_instruments = ("openai >= 1.0.0",) +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py new file mode 100644 index 000000000..c26b7711d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py @@ -0,0 +1,17 @@ +"""Utility functions for WildToolBench instrumentation.""" + +import json +from typing import Any, Optional + + +def safe_json_dumps(obj: Any, max_length: int = 10000) -> Optional[str]: + """Safely serialize object to JSON string with length limit.""" + if obj is None: + return None + try: + s = json.dumps(obj, ensure_ascii=False) + if len(s) > max_length: + return s[:max_length] + "...(truncated)" + return s + except (TypeError, ValueError): + return str(obj)[:max_length] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py new file mode 100644 index 000000000..014186185 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py @@ -0,0 +1,182 @@ +"""Test configuration for WildToolBench instrumentation tests.""" + +import json +import os + +import pytest + +os.environ.setdefault("OPENAI_API_KEY", "test_key_not_real") +os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:9999/v1") + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + + +def pytest_configure(config: pytest.Config): + os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental" + + +@pytest.fixture(scope="function", name="span_exporter") +def fixture_span_exporter(): + exporter = InMemorySpanExporter() + yield exporter + + +@pytest.fixture(scope="function", name="tracer_provider") +def fixture_tracer_provider(span_exporter): + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function") +def instrument(tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() + + +# ==================== Minimal test data fixtures ==================== + + +def _make_chat_completion_response( + content=None, + tool_calls=None, + input_tokens=10, + output_tokens=5, + model="gpt-4o", +): + """Build a minimal ChatCompletion-like dict that can be JSON-serialized.""" + message = {"role": "assistant", "content": content or ""} + if tool_calls: + message["tool_calls"] = tool_calls + return { + "id": "chatcmpl-test", + "object": "chat.completion", + "model": model, + "choices": [{"index": 0, "message": message, "finish_reason": "stop"}], + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + }, + } + + +class FakeChatCompletion: + """Mimics openai.types.chat.ChatCompletion enough for _parse_api_response.""" + + def __init__(self, data: dict): + self._data = data + + def json(self): + return json.dumps(self._data) + + def __getattr__(self, name): + return self._data[name] + + +@pytest.fixture() +def make_completion(): + """Factory fixture to build FakeChatCompletion objects.""" + + def _factory(**kwargs): + return FakeChatCompletion(_make_chat_completion_response(**kwargs)) + + return _factory + + +@pytest.fixture() +def simple_test_entry(): + """A minimal WildToolBench test_entry with 1 task, 1 step (prepare_to_answer).""" + return { + "id": "wild_tool_bench_test_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + }, + "required": ["city"], + }, + }, + } + ], + "english_tasks": ["What is the weather in Beijing?"], + "english_answer_list": [ + [ + { + "action": { + "name": "get_weather", + "arguments": {"city": "Beijing"}, + }, + "observation": "Sunny, 25°C", + "dependency_list": [], + }, + { + "action": { + "name": "prepare_to_answer", + "arguments": {}, + }, + "observation": "The weather in Beijing is Sunny, 25°C", + "dependency_list": [0], + }, + ] + ], + } + + +@pytest.fixture() +def tool_call_response_factory(): + """Factory to make tool_call ChatCompletion responses.""" + + def _factory(tool_name, arguments, tool_call_id="call_001"): + tc = [ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": ( + json.dumps(arguments) + if isinstance(arguments, dict) + else arguments + ), + }, + } + ] + return FakeChatCompletion( + _make_chat_completion_response(tool_calls=tc) + ) + + return _factory + + +@pytest.fixture() +def text_response_factory(): + """Factory to make text-only ChatCompletion responses.""" + + def _factory(content, input_tokens=10, output_tokens=5): + return FakeChatCompletion( + _make_chat_completion_response( + content=content, + input_tokens=input_tokens, + output_tokens=output_tokens, + ) + ) + + return _factory diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py new file mode 100644 index 000000000..2929eeb33 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py @@ -0,0 +1,108 @@ +"""Tests for AGENT span (P2: inference_multi_turn).""" + +import json +from unittest.mock import patch + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing AGENT span.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.1 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestAgentSpan: + def test_agent_span_attributes( + self, span_exporter, instrument, simple_test_entry, make_completion, + tool_call_response_factory, text_response_factory, + ): + """AGENT span should exist with correct attributes and token aggregation.""" + handler = _StubHandler() + + # Step 0: model returns tool call for get_weather + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + # Step 1: model returns text (prepare_to_answer match) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=20, output_tokens=15, + ) + handler._step_responses = [resp0, resp1] + + result = handler.inference_multi_turn(simple_test_entry) + assert result is not None + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + assert len(agent_spans) == 1 + + span = agent_spans[0] + assert span.name == "invoke_agent _StubHandler" + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "AGENT" + assert attrs.get("gen_ai.operation.name") == "invoke_agent" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.agent.name") == "_StubHandler" + assert attrs.get("gen_ai.conversation.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "test-model" + assert attrs.get("wildtool.turn_count") == 1 + + assert attrs.get("gen_ai.usage.input_tokens") == 30 + assert attrs.get("gen_ai.usage.output_tokens") == 20 + + def test_agent_parent_is_entry( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """When called via multi_threaded_inference, AGENT span should be child of ENTRY.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + + entry = entry_spans[0] + agent = agent_spans[0] + assert agent.context.trace_id == entry.context.trace_id + assert agent.parent is not None + assert agent.parent.span_id == entry.context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py new file mode 100644 index 000000000..d7dd7b4aa --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py @@ -0,0 +1,283 @@ +"""Tests for CHAIN / STEP / TOOL spans (P3, P4, P5).""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass with controllable responses.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestChainSpan: + def test_chain_span_per_task( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each task should produce one CHAIN span with correct attributes.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + + chain = chain_spans[0] + assert chain.name == "workflow task_0" + attrs = dict(chain.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "CHAIN" + assert attrs.get("gen_ai.operation.name") == "workflow" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("wildtool.task_idx") == 0 + assert attrs.get("wildtool.test_entry_id") == "wild_tool_bench_test_001" + assert attrs.get("wildtool.action_name_label") == "correct" + assert attrs.get("wildtool.is_optimal") is True + + def test_chain_parent_is_agent( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """CHAIN span should be child of AGENT span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + chain_spans = [s for s in spans if s.name.startswith("workflow")] + + assert len(agent_spans) == 1 + assert len(chain_spans) == 1 + + agent = agent_spans[0] + chain = chain_spans[0] + assert chain.context.trace_id == agent.context.trace_id + assert chain.parent is not None + assert chain.parent.span_id == agent.context.span_id + + +class TestStepSpans: + def test_step_spans_per_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each _request_tool_call invocation should produce a STEP span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 2 + + attrs0 = dict(step_spans[0].attributes or {}) + attrs1 = dict(step_spans[1].attributes or {}) + rounds = sorted([attrs0.get("gen_ai.react.round"), attrs1.get("gen_ai.react.round")]) + assert rounds == [1, 2] + + for ss in step_spans: + a = dict(ss.attributes or {}) + assert a.get("gen_ai.span.kind") == "STEP" + assert a.get("gen_ai.operation.name") == "react" + + def test_step_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP spans should be children of CHAIN span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(chain_spans) == 1 + chain = chain_spans[0] + + for ss in step_spans: + assert ss.context.trace_id == chain.context.trace_id + assert ss.parent is not None + assert ss.parent.span_id == chain.context.span_id + + def test_step_token_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP span should have gen_ai.usage.input_tokens and output_tokens.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=25, output_tokens=12, + ) + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = sorted( + [s for s in spans if s.name == "react step"], + key=lambda s: s.attributes.get("gen_ai.react.round", 0), + ) + assert len(step_spans) == 2 + + # First step: default 10 input, 5 output from make_completion defaults + a0 = dict(step_spans[0].attributes or {}) + assert a0.get("gen_ai.usage.input_tokens") == 10 + assert a0.get("gen_ai.usage.output_tokens") == 5 + + # Second step: 25 input, 12 output + a1 = dict(step_spans[1].attributes or {}) + assert a1.get("gen_ai.usage.input_tokens") == 25 + assert a1.get("gen_ai.usage.output_tokens") == 12 + + +class TestToolSpans: + def test_tool_span_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL span should have correct attributes including execution_mode.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + + tool = tool_spans[0] + assert tool.name == "execute_tool get_weather" + attrs = dict(tool.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "TOOL" + assert attrs.get("gen_ai.operation.name") == "execute_tool" + assert attrs.get("gen_ai.tool.name") == "get_weather" + assert attrs.get("gen_ai.tool.type") == "function" + assert ( + attrs.get("wildtool.tool.execution_mode") == "ground_truth_replay" + ) + + def test_tool_span_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL spans share the CHAIN trace_id (parent is STEP after Round 2).""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + tool_spans = [s for s in spans if "execute_tool" in s.name] + + assert len(chain_spans) == 1 + assert len(tool_spans) >= 1 + + chain = chain_spans[0] + for ts in tool_spans: + assert ts.context.trace_id == chain.context.trace_id + + +class TestSpanHierarchy: + def test_full_hierarchy( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Verify ENTRY → AGENT → CHAIN → STEP hierarchy and consistent trace_id.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + + entry = [s for s in spans if s.name == "enter_ai_application_system"] + agent = [s for s in spans if "invoke_agent" in s.name] + chain = [s for s in spans if s.name.startswith("workflow")] + step = [s for s in spans if s.name == "react step"] + tool = [s for s in spans if "execute_tool" in s.name] + + assert len(entry) == 1 + assert len(agent) == 1 + assert len(chain) == 1 + assert len(step) == 2 + assert len(tool) >= 1 + + trace_id = entry[0].context.trace_id + for s in spans: + assert s.context.trace_id == trace_id + + # AGENT parent = ENTRY + assert agent[0].parent.span_id == entry[0].context.span_id + # CHAIN parent = AGENT + assert chain[0].parent.span_id == agent[0].context.span_id + # STEP parent = CHAIN + for s in step: + assert s.parent.span_id == chain[0].context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py new file mode 100644 index 000000000..834e7dd13 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py @@ -0,0 +1,115 @@ +"""Tests for ENTRY span (P1: multi_threaded_inference). + +Module-level imports of ``wtb._llm_response_generation.multi_threaded_inference`` +must be avoided: ``wrapt.wrap_function_wrapper`` patches the attribute on the +module, but a pre-imported local binding still references the original +unwrapped function. All tests therefore import the symbol lazily after the +``instrument`` fixture has run. +""" + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing. + + Overrides ``inference`` so the multi_threaded_inference wrapper invokes a + deterministic, side-effect-free body that returns a fake result dict and + therefore exercises only the ENTRY span codepath. + """ + + def __init__(self): + super().__init__("test-model", 0.0) + + def _request_tool_call(self, inference_data): + raise NotImplementedError + + def _parse_api_response(self, api_response): + raise NotImplementedError + + def inference(self, test_entry): + return [ + { + "action_name_label": "correct", + "is_optimal": True, + "inference_log": {}, + "latency": [0.1], + "input_token_count": [10], + "output_token_count": [5], + } + ] + + +class TestEntrySpan: + def test_entry_span_created(self, span_exporter, instrument): + """ENTRY span should be created with correct attributes.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + test_case = { + "id": "wild_tool_bench_test_001", + "english_tasks": ["task1", "task2"], + } + + result = multi_threaded_inference(handler, "gpt-4o", test_case) + + assert result is not None + assert result["id"] == "wild_tool_bench_test_001" + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + + span = entry_spans[0] + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "ENTRY" + assert attrs.get("gen_ai.operation.name") == "enter" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.session.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "gpt-4o" + assert attrs.get("wildtool.turn_count") == 2 + # ENTRY spans rely on default OTel status semantics: success leaves + # the span UNSET, failures explicitly mark it ERROR. + assert span.status.status_code != StatusCode.ERROR + + def test_entry_span_error_path(self, span_exporter, instrument): + """The ENTRY wrapper marks the span ERROR when the wrapped callable + raises an unhandled exception. + + ``multi_threaded_inference`` swallows non-rate-limit errors itself + (see test_error_scenarios.test_entry_span_captures_retry_error_path + for that path). To exercise the wrapper's failure branch directly we + invoke the underlying ``WildToolEntryWrapper`` with a callable that + deliberately raises, bypassing ``multi_threaded_inference``'s own + error handling. + """ + from opentelemetry.instrumentation.wildtool._wrappers import ( + WildToolEntryWrapper, + ) + + wrapper = WildToolEntryWrapper(instrument._handler) + + def _raising(handler, model_name, test_case): + raise RuntimeError("API connection failed") + + handler = _StubHandler() + test_case = { + "id": "wild_tool_bench_test_002", + "english_tasks": ["task1"], + } + + with pytest.raises(RuntimeError, match="API connection failed"): + wrapper(_raising, None, (handler, "gpt-4o", test_case), {}) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + span = entry_spans[0] + assert span.status.status_code == StatusCode.ERROR diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py new file mode 100644 index 000000000..c14a3f40c --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py @@ -0,0 +1,135 @@ +"""Tests for error/edge-case scenarios.""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Handler with controllable step responses.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + if isinstance(resp, Exception): + raise resp + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestErrorScenarios: + def test_action_name_mismatch( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, + ): + """When model calls wrong tool, CHAIN span should still be OK with error label.""" + handler = _StubHandler() + # Model calls wrong_tool instead of get_weather + resp0 = tool_call_response_factory( + "wrong_tool", {"x": 1}, "call_bad" + ) + handler._step_responses = [resp0] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + + chain = chain_spans[0] + attrs = dict(chain.attributes or {}) + assert attrs.get("wildtool.action_name_label") == "error" + assert chain.status.status_code == StatusCode.OK + + def test_empty_response( + self, span_exporter, instrument, simple_test_entry, + make_completion, + ): + """When model returns no content and no tool_calls, process terminates gracefully.""" + from tests.conftest import FakeChatCompletion, _make_chat_completion_response + + handler = _StubHandler() + resp = FakeChatCompletion( + _make_chat_completion_response(content="", tool_calls=None) + ) + handler._step_responses = [resp] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + attrs = dict(chain_spans[0].attributes or {}) + assert attrs.get("wildtool.action_name_label") == "error" + + def test_request_tool_call_exception_sets_error( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call should produce ERROR on STEP span and propagate.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Connection timeout")] + + with pytest.raises(RuntimeError, match="Connection timeout"): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 1 + assert step_spans[0].status.status_code == StatusCode.ERROR + + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + assert chain_spans[0].status.status_code == StatusCode.ERROR + + def test_entry_span_captures_retry_error_path( + self, span_exporter, instrument, + ): + """multi_threaded_inference catches non-rate-limit errors and returns error dict. + ENTRY span should still complete successfully (not raise).""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + + def failing_inference(test_entry): + raise ValueError("Invalid JSON from model") + + handler.inference = failing_inference + + test_case = { + "id": "wild_tool_bench_err_001", + "english_tasks": ["task1"], + } + + # multi_threaded_inference catches non-rate-limit errors + result = multi_threaded_inference(handler, "test-model", test_case) + assert "Error during inference" in result["result"] + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + # multi_threaded_inference's own try/except converts the error into a + # normal return, so the ENTRY wrapper observes a successful call and + # leaves the span at the default UNSET status (definitely not ERROR). + span = entry_spans[0] + assert span.status.status_code != StatusCode.ERROR diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py new file mode 100644 index 000000000..a8be5b4da --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py @@ -0,0 +1,20 @@ +"""Tests for WildToolInstrumentor lifecycle.""" + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor + + +class TestWildToolInstrumentor: + def test_instrument_and_uninstrument(self, tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_instrumentation_dependencies(self): + instrumentor = WildToolInstrumentor() + deps = instrumentor.instrumentation_dependencies() + assert ("openai >= 1.0.0",) == deps diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py new file mode 100644 index 000000000..9f4f4d895 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py @@ -0,0 +1,441 @@ +"""Round 2 regression tests covering the H1 / H2 / M1 / M2 / M3 fixes. + +See ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)" and +``example-deploy/validation/SUMMARY.md`` for the original validation gaps +addressed by these tests. +""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler with controllable LLM responses (no real network).""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + if isinstance(resp, Exception): + raise resp + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +def _spans_by_kind(spans, kind): + return [s for s in spans if (s.attributes or {}).get("gen_ai.span.kind") == kind] + + +def _spans_named(spans, name): + return [s for s in spans if s.name == name] + + +def _step_for_round(spans, round_num): + for s in _spans_named(spans, "react step"): + attrs = s.attributes or {} + if attrs.get("gen_ai.react.round") == round_num: + return s + raise AssertionError(f"no STEP span found for round={round_num}") + + +# ============================================================================ +# H1: TOOL span parent_span_id == STEP span_id (was CHAIN before fix) +# ============================================================================ + + +class TestToolParentIsStep: + def test_single_tool_parent_is_step_round_one( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """The single TOOL span in simple_test_entry should be a child of the + first STEP span (round=1), not the CHAIN span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = _spans_by_kind(spans, "TOOL") + assert len(tool_spans) == 1, [s.name for s in spans] + + tool = tool_spans[0] + step_round1 = _step_for_round(spans, 1) + chain = _spans_by_kind(spans, "CHAIN")[0] + + # H1 core assertion: parent is STEP, not CHAIN. + assert tool.parent is not None + assert tool.parent.span_id == step_round1.context.span_id, ( + "TOOL parent should be STEP round=1, got " + f"{tool.parent.span_id} (STEP={step_round1.context.span_id}, " + f"CHAIN={chain.context.span_id})" + ) + assert tool.parent.span_id != chain.context.span_id + + # And trace_id of course remains consistent. + assert tool.context.trace_id == step_round1.context.trace_id + + def test_multi_step_each_tool_parented_to_correct_step( + self, span_exporter, instrument, + tool_call_response_factory, text_response_factory, + ): + """multi-step scenario: 2 successful tool steps + 1 prepare_to_answer. + + Each TOOL span must be parented to the STEP span of its own round, + not to the CHAIN or to a different round's STEP. + """ + handler = _StubHandler() + # Test entry with 2 tool steps (search, lookup) then prepare_to_answer. + test_entry = { + "id": "wild_tool_bench_multi_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "search", + "description": "Search items", + "parameters": { + "type": "object", + "properties": {"q": {"type": "string"}}, + "required": ["q"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "lookup", + "description": "Look up details", + "parameters": { + "type": "object", + "properties": {"id": {"type": "string"}}, + "required": ["id"], + }, + }, + }, + ], + "english_tasks": ["Find and summarize item X"], + "english_answer_list": [ + [ + { + "action": {"name": "search", "arguments": {"q": "X"}}, + "observation": "found:item_42", + "dependency_list": [], + }, + { + "action": {"name": "lookup", "arguments": {"id": "item_42"}}, + "observation": "details:hello", + "dependency_list": [0], + }, + { + "action": {"name": "prepare_to_answer", "arguments": {}}, + "observation": "Item X is hello.", + "dependency_list": [1], + }, + ] + ], + } + + resp_step1 = tool_call_response_factory( + "search", {"q": "X"}, "call_search_1" + ) + resp_step2 = tool_call_response_factory( + "lookup", {"id": "item_42"}, "call_lookup_1" + ) + resp_step3 = text_response_factory("Item X is hello.") + handler._step_responses = [resp_step1, resp_step2, resp_step3] + + handler.inference_multi_turn(test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = sorted( + _spans_by_kind(spans, "TOOL"), + key=lambda s: (s.attributes or {}).get("gen_ai.tool.name") or "", + ) + assert len(tool_spans) == 2, [s.name for s in spans] + + step_round1 = _step_for_round(spans, 1) + step_round2 = _step_for_round(spans, 2) + chain = _spans_by_kind(spans, "CHAIN")[0] + + lookup_tool = next( + t for t in tool_spans + if (t.attributes or {}).get("gen_ai.tool.name") == "lookup" + ) + search_tool = next( + t for t in tool_spans + if (t.attributes or {}).get("gen_ai.tool.name") == "search" + ) + + # search → STEP round=1, lookup → STEP round=2 + assert search_tool.parent.span_id == step_round1.context.span_id + assert lookup_tool.parent.span_id == step_round2.context.span_id + # Neither parented on CHAIN (the regression we are fixing) + for t in tool_spans: + assert t.parent.span_id != chain.context.span_id + assert t.context.trace_id == chain.context.trace_id + + +# ============================================================================ +# M1: CHAIN span carries input.value and output.value +# ============================================================================ + + +class TestChainInputOutputValue: + def test_chain_input_value_and_output_value( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = _spans_by_kind(spans, "CHAIN") + assert len(chain_spans) == 1 + attrs = dict(chain_spans[0].attributes or {}) + + # input.value: last user message of the chain (prepared by wtb's + # _pre_messages_processing which appends the current task as user). + assert "input.value" in attrs, attrs + assert attrs["input.value"] == "What is the weather in Beijing?" + + # output.value: JSON containing action_name_label, task_idx, is_optimal. + assert "output.value" in attrs, attrs + out = json.loads(attrs["output.value"]) + assert out["action_name_label"] == "correct" + assert out["task_idx"] == 0 + assert out["is_optimal"] is True + + def test_chain_input_value_truncated_when_long( + self, span_exporter, instrument, + tool_call_response_factory, text_response_factory, + ): + """Very long user content should be truncated to keep span attribute small.""" + handler = _StubHandler() + long_text = "x" * 5000 + test_entry = { + "id": "wild_tool_bench_long_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "noop", + "description": "noop", + "parameters": {"type": "object", "properties": {}}, + }, + } + ], + "english_tasks": [long_text], + "english_answer_list": [ + [ + { + "action": {"name": "prepare_to_answer", "arguments": {}}, + "observation": "ok", + "dependency_list": [], + } + ] + ], + } + handler._step_responses = [text_response_factory("ok")] + + handler.inference_multi_turn(test_entry) + + spans = span_exporter.get_finished_spans() + chain = _spans_by_kind(spans, "CHAIN")[0] + attrs = dict(chain.attributes or {}) + assert "input.value" in attrs + # Default cap is 4096; truncated form must be <= cap + suffix length. + assert len(attrs["input.value"]) <= 4096 + len("...(truncated)") + assert attrs["input.value"].startswith("xxx") + + +# ============================================================================ +# M2: STEP span carries gen_ai.react.finish_reason on error paths +# ============================================================================ + + +class TestStepFinishReason: + def test_finish_reason_action_name_mismatch( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, + ): + handler = _StubHandler() + # wrong tool name → wtb's "action name not in candidate" branch + handler._step_responses = [ + tool_call_response_factory("wrong_tool", {"x": 1}, "call_bad") + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "action_name_mismatch" + + def test_finish_reason_empty_response( + self, span_exporter, instrument, simple_test_entry, make_completion, + ): + """Empty content + no tool_calls → STEP gets finish_reason=empty_response.""" + from tests.conftest import ( + FakeChatCompletion, + _make_chat_completion_response, + ) + + handler = _StubHandler() + handler._step_responses = [ + FakeChatCompletion( + _make_chat_completion_response(content="", tool_calls=None) + ) + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "empty_response" + + def test_finish_reason_request_exception( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call → STEP ERROR + finish_reason=error.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Boom")] + + with pytest.raises(RuntimeError): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert steps[0].status.status_code == StatusCode.ERROR + assert attrs.get("gen_ai.react.finish_reason") == "error" + + def test_finish_reason_omitted_on_success( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Successful steps should NOT have a finish_reason (per execute.md).""" + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + for s in _spans_named(spans, "react step"): + attrs = dict(s.attributes or {}) + assert "gen_ai.react.finish_reason" not in attrs, ( + f"unexpected finish_reason on success step round=" + f"{attrs.get('gen_ai.react.round')}: {attrs.get('gen_ai.react.finish_reason')}" + ) + + +# ============================================================================ +# M3: TOOL span carries gen_ai.tool.call.arguments / result / description +# (and keeps wildtool.tool.execution_mode) +# ============================================================================ + + +class TestToolSensitiveAttributes: + def test_tool_args_result_description_and_execution_mode( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("Sunny day") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = _spans_by_kind(spans, "TOOL") + assert len(tool_spans) == 1 + attrs = dict(tool_spans[0].attributes or {}) + + # M3 explicit attrs. + args_attr = attrs.get("gen_ai.tool.call.arguments") + assert args_attr is not None + assert json.loads(args_attr) == {"city": "Beijing"} + + # observation comes from the appended {"role": "tool", ...} message + # written by wtb after the call matches the answer; it's a string. + result_attr = attrs.get("gen_ai.tool.call.result") + assert result_attr == "Sunny, 25°C", attrs + + # description sourced from inference_data["tools"][i].function.description + assert attrs.get("gen_ai.tool.description") == "Get weather for a city" + + # Existing custom attribute must still be present. + assert ( + attrs.get("wildtool.tool.execution_mode") + == "ground_truth_replay" + ) + + +# ============================================================================ +# H2: STEP span carries gen_ai.system / gen_ai.provider.name fallback +# ============================================================================ + + +class TestStepProviderFallback: + def test_step_has_provider_name_fallback( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 2 + for s in steps: + attrs = dict(s.attributes or {}) + assert attrs.get("gen_ai.system") == "openai", attrs + assert attrs.get("gen_ai.provider.name") == "openai", attrs diff --git a/packages.txt b/packages.txt new file mode 100644 index 000000000..cee224898 --- /dev/null +++ b/packages.txt @@ -0,0 +1,112 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.10.2 +aiosignal==1.3.1 +aliyun-instrumentation-sglang @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/aliyun-instrumentation-sglang +aliyun-instrumentation-vllm @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/aliyun-instrumentation-vllm +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=aliyun_sdk_extension_arms&subdirectory=sdk-extension/aliyun-sdk-extension-arms +aliyun-semantic-conventions==1.2.0 +annotated-types==0.7.0 +anyio==4.10.0 +asgiref==3.8.1 +asttokens==3.0.0 +async-timeout==4.0.3 +attrs==25.3.0 +blinker==1.7.0 +build==1.3.0 +bytecode==0.17.0 +certifi==2024.7.4 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +cramjam==2.10.0 +crcmod==1.7 +decorator==5.2.1 +Deprecated==1.2.14 +Django==5.2.4 +executing==2.2.1 +fastapi==0.116.1 +filelock==3.19.1 +Flask==3.0.2 +frozenlist==1.4.1 +fsspec==2025.9.0 +googleapis-common-protos==1.70.0 +h11==0.16.0 +http_server_mock==1.7 +httpcore==1.0.9 +httpretty==1.1.4 +httpx==0.28.1 +idna==3.7 +importlib_metadata==8.4.0 +iniconfig==2.0.0 +ipython==9.5.0 +ipython_pygments_lexers==1.1.1 +itsdangerous==2.1.2 +jedi==0.19.2 +Jinja2==3.1.4 +jsonpath==0.82.2 +MarkupSafe==2.1.5 +matplotlib-inline==0.1.7 +mpmath==1.3.0 +multidict==6.0.5 +networkx==3.5 +numpy==2.3.2 +opentelemetry-api==1.30.0 +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_exporter_otlp_proto_http&subdirectory=exporter/opentelemetry-exporter-otlp-proto-http +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation&subdirectory=opentelemetry-instrumentation +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation_aiohttp_client&subdirectory=instrumentation/opentelemetry-instrumentation-aiohttp-client +opentelemetry-instrumentation-asgi @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-asgi +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation_django&subdirectory=instrumentation/opentelemetry-instrumentation-django +opentelemetry-instrumentation-fastapi @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-flask @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-flask +opentelemetry-instrumentation-httpx @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-httpx +opentelemetry-instrumentation-requests @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-requests +opentelemetry-instrumentation-tornado @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-tornado +opentelemetry-instrumentation-wsgi==0.51b0 +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_sdk&subdirectory=opentelemetry-sdk +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_semantic_conventions&subdirectory=opentelemetry-semantic-conventions +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_test_utils&subdirectory=opentelemetry-test-utils +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_util_http&subdirectory=util/opentelemetry-util-http +packaging==24.0 +parso==0.8.5 +pexpect==4.9.0 +pillow==11.3.0 +pluggy==1.5.0 +prompt_toolkit==3.0.52 +propcache==0.3.2 +protobuf==6.32.0 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-cpuinfo==9.0.0 +pydantic==2.11.7 +pydantic_core==2.33.2 +Pygments==2.19.2 +pyproject_hooks==1.2.0 +pytest==7.4.4 +python-snappy==0.7.3 +PyYAML==6.0.2 +requests==2.32.3 +setproctitle==1.3.6 +setuptools==80.9.0 +sglang==0.4.8 +sniffio==1.3.1 +sqlparse==0.5.3 +stack-data==0.6.3 +starlette==0.47.2 +sympy==1.14.0 +tomli==2.0.1 +tomlkit==0.13.3 +torch==2.8.0 +tornado==6.5.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing-inspection==0.4.1 +typing_extensions==4.12.2 +urllib3==2.2.2 +uvloop==0.21.0 +wcwidth==0.2.13 +Werkzeug==3.0.6 +wheel==0.45.1 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.19.2