diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md new file mode 100644 index 000000000..62fb6539b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +All notable changes to the LoongSuite BFCL v4 instrumentation are documented +in this file. + +## Unreleased + +### Added + +- Initial release of `loongsuite-instrumentation-bfclv4`. +- ENTRY span around `bfcl_eval._llm_response_generation.generate_results`. +- AGENT span around `bfcl_eval.model_handler.base_handler.BaseHandler.inference` + with cross-thread OTel context propagation via a narrow patch of + `bfcl_eval._llm_response_generation.ThreadPoolExecutor`. +- STEP spans created by reflectively wrapping each handler's + `_query_FC` / `_query_prompting` (discovered via + `bfcl_eval.constants.model_config.MODEL_CONFIG_MAPPING`). +- Per-call TOOL spans emitted by wrapping + `bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call`. +- Provider override mapping for OSS handlers (vLLM / SGLang). +- Multi-turn `bfcl.turn_idx` and ReAct `gen_ai.react.round` tracking via + `contextvars`. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md new file mode 100644 index 000000000..7a4e5d69d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md @@ -0,0 +1,79 @@ +# LoongSuite BFCL v4 Instrumentation + +LoongSuite Python instrumentation for the [Berkeley Function Call +Leaderboard v4](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) +(`bfcl-eval`, package `bfcl_eval`). 
+ +## Span Topology + +``` +ENTRY enter_ai_application_system gen_ai.span.kind=ENTRY, op=enter +└─ AGENT invoke_agent {test_entry_id} gen_ai.span.kind=AGENT, op=invoke_agent + ├─ STEP react step gen_ai.span.kind=STEP, op=react + │ ├─ LLM chat {model} (created by downstream vendor SDK probe) + │ └─ TOOL execute_tool {fn} gen_ai.span.kind=TOOL, op=execute_tool + └─ STEP react step + └─ ... +``` + +This instrumentation deliberately does **not** create LLM spans. They are +emitted by the downstream vendor SDK probe (OpenAI / Anthropic / Google / +DashScope / LiteLLM / etc.) so that token usage and request payloads stay in +sync with the SDK that actually performed the request. + +## Installation + +```bash +pip install loongsuite-instrumentation-bfclv4 +``` + +## Usage + +```bash +opentelemetry-instrument bfcl generate \ + --model gpt-4o-2024-11-20-FC \ + --test-category simple_python \ + --num-threads 2 +``` + +Or programmatically: + +```python +from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + +BFCLv4Instrumentor().instrument() +# ... run BFCL ... +BFCLv4Instrumentor().uninstrument() +``` + +## Compatibility With Downstream LLM SDK Probes + +| Scenario | Recommended downstream probe | +| --- | --- | +| OpenAI / OpenAI Responses / OSS via vLLM / SGLang / DeepSeek (OpenAI-compatible) | `opentelemetry-instrumentation-openai` | +| Anthropic / Claude | `loongsuite-instrumentation-claude-agent-sdk` | +| Gemini / Google | `loongsuite-instrumentation-google-adk` | +| Qwen / DashScope | `loongsuite-instrumentation-dashscope` | +| LiteLLM | `loongsuite-instrumentation-litellm` | + +## OSS Provider Notes + +For OSS handlers (vLLM / SGLang served via the OpenAI-compatible API), the +BFCL probe sets `gen_ai.provider.name` to `vllm` / `sglang` / `oss` and adds +`bfcl.oss.backend` for disambiguation. Downstream OpenAI probes will still +report `gen_ai.provider.name=openai` on the LLM span; this is expected. 
+ +## Custom Attributes + +| Attribute | Where | Description | +| --- | --- | --- | +| `gen_ai.framework` = `bfclv4` | ENTRY/AGENT/STEP/TOOL | Framework tag | +| `bfcl.test_category` | ENTRY/AGENT | Test category | +| `bfcl.num_threads` | ENTRY | Configured thread pool size | +| `bfcl.test_case_count` | ENTRY | Number of test cases | +| `bfcl.run_ids` | ENTRY | Whether the run targeted specific IDs | +| `bfcl.test_entry_id` | AGENT | Test entry id | +| `bfcl.turn_idx` | STEP | Multi-turn turn index (0-based) | +| `bfcl.query_mode` | STEP | `FC` or `prompting` | +| `bfcl.oss.backend` | AGENT/STEP | `vllm` / `sglang` / `unknown` (only OSS) | +| `bfcl.tool.duration_is_estimated` | TOOL | True (latency is averaged across batch) | diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml new file mode 100644 index 000000000..3eeb5d026 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-bfclv4" +dynamic = ["version"] +description = "LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", + 
"opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "bfcl-eval >= 4.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +bfclv4 = "opentelemetry.instrumentation.bfclv4:BFCLv4Instrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/bfclv4/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py new file mode 100644 index 000000000..6a7729940 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py @@ -0,0 +1,322 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + + BFCLv4Instrumentor().instrument() + # ... run BFCL ... 
+ BFCLv4Instrumentor().uninstrument() + +API +--- +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection, List, Tuple + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + BaseHandlerInferenceWrapper, + ExecuteFuncCallWrapper, + GenerateResultsWrapper, + QueryWrapper, + TurnBumpWrapper, +) +from opentelemetry.instrumentation.bfclv4.package import _instruments +from opentelemetry.instrumentation.bfclv4.utils import GenAIHookHelper +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap + +logger = logging.getLogger(__name__) + +__all__ = ["BFCLv4Instrumentor"] + + +_GENERATE_RESULTS_MODULE = "bfcl_eval._llm_response_generation" +_GENERATE_RESULTS_NAME = "generate_results" + +_BASE_HANDLER_MODULE = "bfcl_eval.model_handler.base_handler" +_BASE_HANDLER_NAME = "BaseHandler.inference" + +_EXECUTE_TOOL_MODULE = ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils" +) +_EXECUTE_TOOL_NAME = "execute_multi_turn_func_call" + + +# ``MODEL_CONFIG_MAPPING`` already imports every concrete handler at module +# load time, so iterating over its values gives us the canonical handler +# class set without risking new vendor SDK imports. 
+def _iter_handler_classes() -> List[type]: + try: + from bfcl_eval.constants.model_config import ( # noqa: PLC0415 + MODEL_CONFIG_MAPPING, + ) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: cannot import MODEL_CONFIG_MAPPING: %s", exc + ) + return [] + + classes: List[type] = [] + seen_class_ids: set[int] = set() + for cfg in MODEL_CONFIG_MAPPING.values(): + cls = getattr(cfg, "model_handler", None) + if cls is None or not isinstance(cls, type): + continue + if id(cls) in seen_class_ids: + continue + seen_class_ids.add(id(cls)) + classes.append(cls) + return classes + + +class BFCLv4Instrumentor(BaseInstrumentor): + """An instrumentor for the BFCL v4 (``bfcl_eval``) framework.""" + + def __init__(self) -> None: + super().__init__() + if not hasattr(self, "_wrapped_query_methods"): + self._wrapped_query_methods: List[Tuple[type, str]] = [] + if not hasattr(self, "_wrapped_turn_methods"): + self._wrapped_turn_methods: List[Tuple[type, str]] = [] + if not hasattr(self, "_entry_wrapped"): + self._entry_wrapped = False + if not hasattr(self, "_inference_wrapped"): + self._inference_wrapped = False + if not hasattr(self, "_tool_wrapped"): + self._tool_wrapped = False + if not hasattr(self, "_tool_targets"): + self._tool_targets: List[Tuple[str, str]] = [] + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + # ------------------------------------------------------------------ + # _instrument + + def _instrument(self, **kwargs: Any) -> None: # noqa: D401 + helper = GenAIHookHelper() + + # 1) ENTRY ----------------------------------------------------- + try: + wrap_function_wrapper( + _GENERATE_RESULTS_MODULE, + _GENERATE_RESULTS_NAME, + GenerateResultsWrapper(helper), + ) + self._entry_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap %s.%s: %s", + _GENERATE_RESULTS_MODULE, + _GENERATE_RESULTS_NAME, + exc, + ) + + # 2) AGENT 
----------------------------------------------------- + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + _BASE_HANDLER_NAME, + BaseHandlerInferenceWrapper(helper), + ) + self._inference_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap %s.%s: %s", + _BASE_HANDLER_MODULE, + _BASE_HANDLER_NAME, + exc, + ) + + # 3) STEP + 4) turn maintenance -------------------------------- + self._instrument_handlers(helper) + + # 5) TOOL ------------------------------------------------------ + # ``execute_multi_turn_func_call`` is re-exported via ``from ... import`` + # in several BFCL modules, so wrapping just the source module misses + # the call sites that use the local binding. We wrap each known + # re-export site as well to guarantee the TOOL span is always emitted. + tool_targets = [ + (_EXECUTE_TOOL_MODULE, _EXECUTE_TOOL_NAME), + ( + "bfcl_eval.model_handler.base_handler", + _EXECUTE_TOOL_NAME, + ), + ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker", + _EXECUTE_TOOL_NAME, + ), + ] + wrapper_instance = ExecuteFuncCallWrapper(helper) + self._tool_targets = [] + for module_name, attr_name in tool_targets: + try: + wrap_function_wrapper( + module_name, + attr_name, + wrapper_instance, + ) + self._tool_targets.append((module_name, attr_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s: %s", + module_name, + attr_name, + exc, + ) + self._tool_wrapped = bool(self._tool_targets) + + def _instrument_handlers(self, helper: GenAIHookHelper) -> None: + # Reflectively wrap every concrete ``_query_FC`` / ``_query_prompting`` + # plus the turn-maintenance helpers; we de-duplicate by function id so + # subclasses that share an inherited implementation are wrapped only + # once. 
+ seen_func_ids: set[int] = set() + + query_pairs = ( + ("_query_FC", "FC"), + ("_query_prompting", "prompting"), + ) + turn_pairs = ( + ("add_first_turn_message_FC", True), + ("add_first_turn_message_prompting", True), + ("_add_next_turn_user_message_FC", False), + ("_add_next_turn_user_message_prompting", False), + ) + + for cls in _iter_handler_classes(): + class_dict = getattr(cls, "__dict__", {}) + for method_name, mode in query_pairs: + method = class_dict.get(method_name) + if method is None or not callable(method): + continue + key = id(method) + if key in seen_func_ids: + continue + seen_func_ids.add(key) + try: + wrap_function_wrapper( + cls.__module__, + f"{cls.__name__}.{method_name}", + QueryWrapper(helper, mode), + ) + self._wrapped_query_methods.append((cls, method_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s.%s: %s", + cls.__module__, + cls.__name__, + method_name, + exc, + ) + + for method_name, is_first in turn_pairs: + method = class_dict.get(method_name) + if method is None or not callable(method): + continue + key = id(method) + if key in seen_func_ids: + continue + seen_func_ids.add(key) + try: + wrap_function_wrapper( + cls.__module__, + f"{cls.__name__}.{method_name}", + TurnBumpWrapper(reset=is_first), + ) + self._wrapped_turn_methods.append((cls, method_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s.%s: %s", + cls.__module__, + cls.__name__, + method_name, + exc, + ) + + # ------------------------------------------------------------------ + # _uninstrument + + def _uninstrument(self, **kwargs: Any) -> None: # noqa: D401 + if self._tool_wrapped: + for module_name, attr_name in getattr(self, "_tool_targets", []): + try: + module = importlib.import_module(module_name) + unwrap(module, attr_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + module_name, + attr_name, + exc, + ) + 
self._tool_targets = [] + self._tool_wrapped = False + + for cls, method_name in self._wrapped_query_methods: + try: + unwrap(cls, method_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + cls.__name__, + method_name, + exc, + ) + self._wrapped_query_methods = [] + + for cls, method_name in self._wrapped_turn_methods: + try: + unwrap(cls, method_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + cls.__name__, + method_name, + exc, + ) + self._wrapped_turn_methods = [] + + if self._inference_wrapped: + try: + base_module = importlib.import_module(_BASE_HANDLER_MODULE) + unwrap(base_module.BaseHandler, "inference") + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap BaseHandler.inference: %s", exc + ) + self._inference_wrapped = False + + if self._entry_wrapped: + try: + module = importlib.import_module(_GENERATE_RESULTS_MODULE) + unwrap(module, _GENERATE_RESULTS_NAME) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap generate_results: %s", exc + ) + self._entry_wrapped = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py new file mode 100644 index 000000000..b0a6f4284 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py @@ -0,0 +1,13 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py new file mode 100644 index 000000000..774200aba --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py @@ -0,0 +1,38 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constant attribute keys used by the BFCL v4 instrumentation.""" + +from __future__ import annotations + +from typing import Final + +FRAMEWORK_NAME: Final = "bfclv4" + +# gen_ai.* attribute keys that are not exported by +# opentelemetry-semantic-conventions today. +GEN_AI_FRAMEWORK: Final = "gen_ai.framework" +GEN_AI_PROVIDER_NAME: Final = "gen_ai.provider.name" + +# BFCL-specific (vendor) attribute keys. 
+BFCL_TEST_CATEGORY: Final = "bfcl.test_category" +BFCL_NUM_THREADS: Final = "bfcl.num_threads" +BFCL_TEST_CASE_COUNT: Final = "bfcl.test_case_count" +BFCL_RUN_IDS: Final = "bfcl.run_ids" +BFCL_TEST_ENTRY_ID: Final = "bfcl.test_entry_id" +BFCL_TURN_IDX: Final = "bfcl.turn_idx" +BFCL_QUERY_MODE: Final = "bfcl.query_mode" +BFCL_OSS_BACKEND: Final = "bfcl.oss.backend" +BFCL_TOOL_DURATION_IS_ESTIMATED: Final = "bfcl.tool.duration_is_estimated" +BFCL_TOOL_INDEX: Final = "bfcl.tool.index" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py new file mode 100644 index 000000000..efa2c77dc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py @@ -0,0 +1,71 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Map BFCL ``ModelStyle`` enum values to ``gen_ai.provider.name``.""" + +from __future__ import annotations + +import os +from typing import Any, Dict, Tuple + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_OSS_BACKEND, +) + +# The BFCL backend name (vllm / sglang / ...) is communicated from the ENTRY +# wrapper to the per-thread STEP/AGENT wrappers via this env var. 
The ENTRY +# wrapper writes to it before invoking the wrapped function and clears it in +# the ``finally`` clause. +OSS_BACKEND_ENV = "BFCL_BACKEND" + + +def infer_provider(handler: Any) -> Tuple[str, Dict[str, Any]]: + """Return ``(provider_name, extra_attributes)`` for a BFCL handler. + + Falls back to ``"unknown"`` if BFCL is not importable or if the handler + has no ``model_style`` attribute. + """ + + try: + from bfcl_eval.constants.enums import ( # noqa: PLC0415 + ModelStyle, + ) + except ImportError: + return "unknown", {} + + style = getattr(handler, "model_style", None) + if style is None: + return "unknown", {} + + if style is ModelStyle.OSSMODEL: + backend = (os.getenv(OSS_BACKEND_ENV) or "").lower() + if backend in ("vllm", "sglang"): + return backend, {BFCL_OSS_BACKEND: backend} + return "oss", {BFCL_OSS_BACKEND: "unknown"} + + mapping = { + ModelStyle.OPENAI_COMPLETIONS: "openai", + ModelStyle.OPENAI_RESPONSES: "openai", + ModelStyle.ANTHROPIC: "anthropic", + ModelStyle.GOOGLE: "gcp.gemini", + ModelStyle.MISTRAL: "mistral_ai", + ModelStyle.COHERE: "cohere", + ModelStyle.AMAZON: "aws.bedrock", + ModelStyle.FIREWORK_AI: "fireworks_ai", + ModelStyle.WRITER: "writer", + ModelStyle.NOVITA_AI: "novita", + ModelStyle.NEXUS: "nexusflow", + ModelStyle.GORILLA: "gorilla", + } + return mapping.get(style, "unknown"), {} diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py new file mode 100644 index 000000000..ae4861035 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py @@ -0,0 +1,93 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Per-thread ReAct state for the BFCL v4 instrumentation. + +We use ``contextvars.ContextVar`` so that each worker thread spawned by the +BFCL ``ThreadPoolExecutor`` gets its own copy. ``_ContextPropagatingExecutor`` +in :mod:`threading_propagation` makes sure ENTRY-time context is copied into +the worker thread; the BaseHandler.inference wrapper then initializes a fresh +state on top of that copy. +""" + +from __future__ import annotations + +import contextvars +from typing import Any, Dict, Optional + +_REACT_STATE: contextvars.ContextVar[Optional[Dict[str, Any]]] = ( + contextvars.ContextVar("bfclv4_react_state", default=None) +) + + +def init_state() -> contextvars.Token: + """Initialise per-AGENT state and return the reset token.""" + state: Dict[str, Any] = { + # ``turn_idx`` is incremented by the wrapper around + # ``_add_next_turn_user_message_*``; it stays ``0`` for single-turn + # tests. + "turn_idx": 0, + # ``fc_round`` is the ReAct round counter. We bump it on every STEP + # entry so the first STEP within a turn ends up with ``round=1``. + "fc_round": 0, + # Counter of executed tool calls within the current AGENT - useful for + # the TOOL span ``tool_call_id`` synthesis. + "tool_index": 0, + } + return _REACT_STATE.set(state) + + +def reset_state(token: contextvars.Token) -> None: + try: + _REACT_STATE.reset(token) + except (LookupError, ValueError): + # Token may have already been reset (e.g. nested error path). 
+ pass + + +def get_state() -> Optional[Dict[str, Any]]: + return _REACT_STATE.get() + + +def bump_round() -> int: + state = _REACT_STATE.get() + if state is None: + return 1 + state["fc_round"] = state.get("fc_round", 0) + 1 + return state["fc_round"] + + +def reset_round_for_turn() -> None: + state = _REACT_STATE.get() + if state is None: + return + state["fc_round"] = 0 + + +def bump_turn() -> int: + state = _REACT_STATE.get() + if state is None: + return 0 + state["turn_idx"] = state.get("turn_idx", 0) + 1 + state["fc_round"] = 0 + return state["turn_idx"] + + +def next_tool_index() -> int: + state = _REACT_STATE.get() + if state is None: + return 0 + idx = state.get("tool_index", 0) + state["tool_index"] = idx + 1 + return idx diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py new file mode 100644 index 000000000..d19c05799 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py @@ -0,0 +1,43 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Context-propagating ``ThreadPoolExecutor`` used by the ENTRY wrapper. 
+ +``concurrent.futures.ThreadPoolExecutor`` does not automatically copy the +current ``contextvars`` context (which holds the OTel current span) into +worker threads. We subclass it and copy ``contextvars.copy_context()`` per +``submit`` so the AGENT span created inside the worker thread can attach as +a child of the ENTRY span. + +We only swap the ``ThreadPoolExecutor`` *name* in the +``bfcl_eval._llm_response_generation`` namespace; the global +``concurrent.futures.ThreadPoolExecutor`` is untouched. +""" + +from __future__ import annotations + +import contextvars +from concurrent.futures import ThreadPoolExecutor as _RealExecutor + + +class ContextPropagatingExecutor(_RealExecutor): + """``ThreadPoolExecutor`` that propagates the calling ``Context``. + + Only the ``submit`` method is overridden because BFCL only uses + ``submit`` (see ``_llm_response_generation.generate_results``). + """ + + def submit(self, fn, /, *args, **kwargs): # type: ignore[override] + ctx = contextvars.copy_context() + return super().submit(ctx.run, fn, *args, **kwargs) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py new file mode 100644 index 000000000..cb9bd3f34 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py @@ -0,0 +1,694 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Wrapper classes for the BFCL v4 instrumentation. + +Each wrapper follows the standard ``wrapt`` callable contract:: + + def __call__(self, wrapped, instance, args, kwargs): + ... + +All wrappers rely on :func:`get_extended_telemetry_handler` (LoongSuite +``util-genai``) to create the actual spans, so that ENTRY / AGENT / STEP / +TOOL spans get the canonical ``gen_ai.span.kind`` and operation-name values +that the LoongSuite semantic-validator expects. +""" + +from __future__ import annotations + +import logging +import os +import time +from typing import Any, Callable, Iterable, List, Optional + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_NUM_THREADS, + BFCL_OSS_BACKEND, + BFCL_QUERY_MODE, + BFCL_RUN_IDS, + BFCL_TEST_CASE_COUNT, + BFCL_TEST_CATEGORY, + BFCL_TEST_ENTRY_ID, + BFCL_TOOL_DURATION_IS_ESTIMATED, + BFCL_TOOL_INDEX, + BFCL_TURN_IDX, + FRAMEWORK_NAME, + GEN_AI_FRAMEWORK, + GEN_AI_PROVIDER_NAME, +) +from opentelemetry.instrumentation.bfclv4.internal.provider import ( + OSS_BACKEND_ENV, + infer_provider, +) +from opentelemetry.instrumentation.bfclv4.internal.state import ( + bump_round, + bump_turn, + init_state, + next_tool_index, + reset_state, +) +from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import ( + ContextPropagatingExecutor, +) +from opentelemetry.instrumentation.bfclv4.utils import ( + GenAIHookHelper, + to_text_input, + to_text_output, + truncate_text, +) +from opentelemetry.util.genai.extended_handler import ( + get_extended_telemetry_handler, +) +from 
opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Helpers + + +def _safe_get(obj: Any, key: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _flatten_tokens(value: Any) -> Optional[int]: + """Sum a possibly nested ``int|float|list|list[list]`` BFCL token field.""" + if value is None: + return None + if isinstance(value, (int, float)): + return int(value) + if isinstance(value, Iterable): + total = 0 + any_seen = False + for item in value: + sub = _flatten_tokens(item) + if sub is not None: + total += sub + any_seen = True + if any_seen: + return total + return None + + +def _test_category_from_id(test_entry_id: Optional[str]) -> Optional[str]: + if not test_entry_id or "_" not in test_entry_id: + return None + return test_entry_id.rsplit("_", 1)[0] + + +def _join_test_category(value: Any) -> Optional[str]: + if value is None: + return None + if isinstance(value, str): + return value + if isinstance(value, (list, tuple, set)): + joined = ",".join(str(v) for v in value if v is not None) + return joined or None + return str(value) + + +# --------------------------------------------------------------------------- +# ENTRY wrapper + + +class GenerateResultsWrapper: + """Wraps ``bfcl_eval._llm_response_generation.generate_results``. + + Responsibilities: + + * Open the ENTRY span (``enter_ai_application_system``). + * Temporarily swap the ``ThreadPoolExecutor`` reference inside the BFCL + generation module to a context-propagating subclass so that AGENT spans + created in worker threads inherit the ENTRY span as parent. + * Publish ``args.backend`` to ``BFCL_BACKEND`` so that + :func:`infer_provider` can attribute OSS spans to vllm / sglang. 
    """

    def __init__(self, helper: GenAIHookHelper) -> None:
        self._helper = helper

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        # ``generate_results(args, model_name, test_cases_total)``
        cli_args = args[0] if len(args) >= 1 else kwargs.get("args")
        model_name = args[1] if len(args) >= 2 else kwargs.get("model_name")
        test_cases_total = (
            args[2] if len(args) >= 3 else kwargs.get("test_cases_total")
        )

        try:
            from bfcl_eval import (  # noqa: PLC0415
                _llm_response_generation as _bfcl_gen,
            )
        except ImportError:
            # bfcl-eval not importable: degrade to a plain pass-through call.
            return wrapped(*args, **kwargs)

        # Narrow monkey-patch: swap the module-level ThreadPoolExecutor for a
        # context-propagating one so worker threads inherit the ENTRY span's
        # OTel context.  The original class is restored in ``finally`` below.
        original_executor = getattr(_bfcl_gen, "ThreadPoolExecutor", None)
        if original_executor is not None:
            _bfcl_gen.ThreadPoolExecutor = ContextPropagatingExecutor

        backend_value = (
            _safe_get(cli_args, "backend", None) if cli_args is not None else None
        )
        # Remember the pre-existing env value so it can be restored verbatim.
        previous_backend_env = os.environ.get(OSS_BACKEND_ENV)
        if backend_value:
            os.environ[OSS_BACKEND_ENV] = str(backend_value)

        session_id_default = None
        if model_name is not None:
            try:
                session_id_default = f"{model_name}@{int(time.time())}"
            except Exception:  # noqa: BLE001
                session_id_default = None
        # An explicit BFCL_SESSION_ID wins over the derived "<model>@<epoch>".
        session_id = (
            os.environ.get("BFCL_SESSION_ID") or session_id_default
        )

        entry_inv = EntryInvocation(session_id=session_id)
        handler = get_extended_telemetry_handler()

        attributes = {GEN_AI_FRAMEWORK: FRAMEWORK_NAME}
        category_value = _join_test_category(
            _safe_get(cli_args, "test_category", None)
        )
        if category_value:
            attributes[BFCL_TEST_CATEGORY] = category_value
        num_threads = _safe_get(cli_args, "num_threads", None)
        if num_threads is not None:
            try:
                attributes[BFCL_NUM_THREADS] = int(num_threads)
            except (TypeError, ValueError):
                pass
        if isinstance(test_cases_total, (list, tuple)):
            attributes[BFCL_TEST_CASE_COUNT] = len(test_cases_total)
        attributes[BFCL_RUN_IDS] = bool(
            _safe_get(cli_args, "run_ids", False)
        )

        try:
            with handler.entry(entry_inv) as inv:
                if inv.span is not None and inv.span.is_recording():
                    for key, value in attributes.items():
                        try:
                            inv.span.set_attribute(key, value)
                        except Exception:  # noqa: BLE001
                            logger.debug(
                                "bfclv4 ENTRY set_attribute(%s) failed",
                                key,
                                exc_info=True,
                            )
                return wrapped(*args, **kwargs)
        finally:
            # Undo the executor monkey-patch and the env override so repeated
            # runs in the same process start from a clean slate.
            if original_executor is not None:
                try:
                    _bfcl_gen.ThreadPoolExecutor = original_executor
                except Exception:  # noqa: BLE001
                    logger.debug(
                        "bfclv4 ENTRY: failed to restore ThreadPoolExecutor",
                        exc_info=True,
                    )
            if backend_value:
                if previous_backend_env is None:
                    os.environ.pop(OSS_BACKEND_ENV, None)
                else:
                    os.environ[OSS_BACKEND_ENV] = previous_backend_env


# ---------------------------------------------------------------------------
# AGENT wrapper


_BFCL_INFERENCE_ERROR_PREFIX = "Error during inference:"


class BaseHandlerInferenceWrapper:
    """Wraps ``BaseHandler.inference``.

    Creates the AGENT span (kind=AGENT, op=invoke_agent) and initialises the
    per-thread ReAct state used by the STEP wrapper.

    BFCL's outer ``multi_threaded_inference`` catches every exception and
    converts it into a ``"Error during inference: ..."`` string; we mirror
    that behaviour by setting the AGENT span status to ERROR when the
    returned ``result`` looks like an error string, instead of relying on
    a re-raised exception.
+ """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``inference(self, test_entry, include_input_log, exclude_state_log)`` + test_entry = args[0] if args else kwargs.get("test_entry") + if not isinstance(test_entry, dict): + return wrapped(*args, **kwargs) + + provider, extra_attrs = infer_provider(instance) + request_model = getattr(instance, "model_name", None) + test_entry_id = test_entry.get("id") + category = _test_category_from_id(test_entry_id) + involved_classes = test_entry.get("involved_classes") or [] + agent_description = ( + ", ".join(str(c) for c in involved_classes) + if isinstance(involved_classes, (list, tuple)) + else None + ) + + invocation = InvokeAgentInvocation( + provider=provider or "unknown", + request_model=request_model, + agent_id=test_entry_id, + agent_name=category or "bfcl_agent", + agent_description=agent_description or None, + conversation_id=test_entry_id, + ) + + token = init_state() + handler = get_extended_telemetry_handler() + try: + with handler.invoke_agent(invocation) as inv: + if inv.span is not None and inv.span.is_recording(): + inv.span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + if provider: + inv.span.set_attribute(GEN_AI_PROVIDER_NAME, provider) + if test_entry_id is not None: + inv.span.set_attribute( + BFCL_TEST_ENTRY_ID, test_entry_id + ) + if category is not None: + inv.span.set_attribute(BFCL_TEST_CATEGORY, category) + for key, value in extra_attrs.items(): + if value is not None: + inv.span.set_attribute(key, value) + + # Capture inputs for the AGENT (gated by content-capture mode). + question = test_entry.get("question") + if question is not None: + inv.input_messages = to_text_input( + "user", truncate_text(_safe_str(question)) + ) + + # Run the original inference call. 
+ try: + result = wrapped(*args, **kwargs) + except Exception as exc: + # The CM will mark the span as failed; we leave it to + # the handler/CM to call ``fail_invoke_agent``. + raise exc + + # Detect BFCL's own captured error path (no exception raised + # but the returned result is the error string). + result_payload = ( + result[0] if isinstance(result, tuple) and result else None + ) + metadata_payload = ( + result[1] + if isinstance(result, tuple) and len(result) >= 2 + else None + ) + + if ( + isinstance(result_payload, str) + and result_payload.startswith(_BFCL_INFERENCE_ERROR_PREFIX) + and inv.span is not None + and inv.span.is_recording() + ): + try: + from opentelemetry.trace import Status, StatusCode + + inv.span.set_status( + Status(StatusCode.ERROR, result_payload[:200]) + ) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 AGENT: failed to set ERROR status", + exc_info=True, + ) + + if isinstance(metadata_payload, dict): + input_tokens = _flatten_tokens( + metadata_payload.get("input_token_count") + ) + output_tokens = _flatten_tokens( + metadata_payload.get("output_token_count") + ) + if input_tokens is not None: + inv.input_tokens = input_tokens + if output_tokens is not None: + inv.output_tokens = output_tokens + + if result_payload is not None: + inv.output_messages = to_text_output( + "assistant", + truncate_text(_safe_str(result_payload)), + ) + + return result + finally: + reset_state(token) + + +def _safe_str(value: Any) -> str: + try: + if isinstance(value, str): + return value + import json + + return json.dumps(value, ensure_ascii=False, default=str) + except Exception: # noqa: BLE001 + try: + return str(value) + except Exception: # noqa: BLE001 + return "" + + +# --------------------------------------------------------------------------- +# STEP wrapper + + +class QueryWrapper: + """Wraps ``._query_FC`` / ``_query_prompting``. 

    Creates a ReAct STEP span around a single model query.  Token usage is
    deliberately NOT recovered here by re-calling the handler's
    ``_parse_query_response_*`` (see the IMPORTANT note in ``__call__``:
    doing so would exhaust streaming responses); usage is recovered later
    from the AGENT-level metadata payload instead.
    """

    def __init__(self, helper: GenAIHookHelper, mode: str) -> None:
        self._helper = helper
        self._mode = mode  # "FC" or "prompting"

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        round_idx = bump_round()
        provider, extra_attrs = infer_provider(instance)

        invocation = ReactStepInvocation(round=round_idx)
        handler_obj = get_extended_telemetry_handler()
        with handler_obj.react_step(invocation) as step_inv:
            span = step_inv.span
            if span is not None and span.is_recording():
                span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
                span.set_attribute(BFCL_QUERY_MODE, self._mode)
                if provider:
                    span.set_attribute(GEN_AI_PROVIDER_NAME, provider)
                model_name = getattr(instance, "model_name", None)
                if model_name:
                    span.set_attribute(
                        "gen_ai.request.model", str(model_name)
                    )
                from opentelemetry.instrumentation.bfclv4.internal.state import (
                    get_state,
                )

                state = get_state()
                if state is not None:
                    span.set_attribute(BFCL_TURN_IDX, state.get("turn_idx", 0))
                for key, value in extra_attrs.items():
                    if value is not None:
                        span.set_attribute(key, value)

            try:
                api_response, query_latency = wrapped(*args, **kwargs)
            except Exception:
                # Let the context-manager mark the span as failed; the BFCL
                # outer try/except will turn this into an "Error during
                # inference: ..." result string at the AGENT layer.
                raise

            # When the underlying handler returns a streaming wrapper
            # (e.g. ``ChatStreamWrapper`` from openai-v2), the LLM span and
            # its OTel context attach are kept alive until the stream is
            # consumed by BFCL's ``_parse_query_response_*`` *outside* of
            # this STEP context manager.  That breaks the LIFO ordering of
            # context attach/detach, leaving the LLM span as the "current"
            # span after the STEP CM exits, which causes the next STEP and
            # any TOOL spans to be parented to the previous STEP rather
            # than to the AGENT.
            #
            # To preserve LIFO ordering, force-consume the stream here
            # (inside the STEP context) and replace it with a plain
            # iterator over the cached chunks. This makes ``stop_llm``
            # (which detaches the LLM context) run *before* STEP detaches.
            if api_response is not None and hasattr(
                api_response, "__next__"
            ) and not isinstance(api_response, (str, bytes)):
                try:
                    chunks = list(api_response)
                    api_response = iter(chunks)
                except Exception:  # noqa: BLE001
                    logger.debug(
                        "bfclv4 STEP: failed to materialise streaming "
                        "response; LLM/STEP nesting may be incorrect",
                        exc_info=True,
                    )

            # Post-call attribute enrichment - use try/except so that any
            # vendor-side parsing surprise never breaks BFCL itself.
            #
            # IMPORTANT: We must NOT re-call ``_parse_query_response_*`` here,
            # because for streaming providers (e.g. Qwen DashScope) the
            # ``api_response`` is a single-pass generator that the parser
            # consumes; calling it twice leaves BFCL's own subsequent call to
            # the parser with an exhausted iterator, which crashes inference
            # with ``UnboundLocalError: chunk``. Token usage will instead be
            # recovered later from the AGENT-level metadata payload.
            try:
                if span is not None and span.is_recording():
                    if isinstance(query_latency, (int, float)):
                        try:
                            # NOTE(review): ``query_latency`` is the whole
                            # query's wall-clock time, not a true TTFT —
                            # this attribute is a best-effort approximation.
                            span.set_attribute(
                                "gen_ai.response.time_to_first_token",
                                int(float(query_latency) * 1e9),
                            )
                        except Exception:  # noqa: BLE001
                            pass
            except Exception:  # noqa: BLE001
                logger.debug(
                    "bfclv4 STEP: post-call enrichment failed", exc_info=True
                )

            return api_response, query_latency


def _infer_finish_reason(model_responses: Any) -> str:
    """Best-effort heuristic for ``gen_ai.react.finish_reason``."""
    if model_responses is None:
        return "unknown"
    if isinstance(model_responses, list):
        if len(model_responses) == 0:
            return "empty_response"
        if len(model_responses) == 1 and not model_responses[0]:
            return "empty_response"
        return "tool_calls"
    if isinstance(model_responses, str):
        # Prompting models often return decoded strings even when there are
        # no tool calls - treat as "stop" so downstream callers know there is
        # no further work to do.
        return "stop"
    return "continue"


# ---------------------------------------------------------------------------
# turn_idx maintenance wrappers (no spans)


class TurnBumpWrapper:
    """Wraps ``.add_first_turn_message_*`` and
    ``._add_next_turn_user_message_*`` to keep ``bfcl.turn_idx`` in
    sync. No spans are created here.
    """

    def __init__(self, *, reset: bool) -> None:
        self._reset = reset

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        try:
            if self._reset:
                # ``add_first_turn_message_*`` runs once at the very start of
                # multi-turn / single-turn inference. We only want to reset
                # to ``turn_idx=0`` here.
                from opentelemetry.instrumentation.bfclv4.internal.state import (
                    get_state,
                )

                state = get_state()
                if state is not None:
                    state["turn_idx"] = 0
                    state["fc_round"] = 0
            else:
                bump_turn()
        except Exception:  # noqa: BLE001
            # State bookkeeping must never break BFCL itself.
            logger.debug(
                "bfclv4: turn_idx maintenance failed", exc_info=True
            )
        return wrapped(*args, **kwargs)


# ---------------------------------------------------------------------------
# TOOL wrapper


class ExecuteFuncCallWrapper:
    """Wraps
    ``bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call``.

    BFCL evaluates a list of function-call strings in a single Python call;
    we surface each one as its own TOOL span by post-processing the wrapped
    result. Per-call latency is approximated by averaging the total elapsed
    time across the batch (``bfcl.tool.duration_is_estimated=true``).
    """

    def __init__(self, helper: GenAIHookHelper) -> None:
        self._helper = helper

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        # ``execute_multi_turn_func_call(func_call_list, initial_config,
        #                                involved_classes, model_name,
        #                                test_entry_id, long_context=False,
        #                                is_evaL_run=False)``
        func_call_list = (
            args[0] if args else kwargs.get("func_call_list", [])
        )
        model_name = (
            args[3]
            if len(args) >= 4
            else kwargs.get("model_name")
        )
        test_entry_id = (
            args[4]
            if len(args) >= 5
            else kwargs.get("test_entry_id")
        )

        if not isinstance(func_call_list, list) or not func_call_list:
            return wrapped(*args, **kwargs)

        t0 = time.perf_counter()
        try:
            result = wrapped(*args, **kwargs)
        finally:
            # Measured even on the error path; the span loop below only runs
            # when ``wrapped`` returned normally.
            elapsed = max(time.perf_counter() - t0, 0.0)

        execution_results: List[str] = []
        if isinstance(result, tuple) and result:
            payload = result[0]
            if isinstance(payload, list):
                execution_results = list(payload)

        per_call_seconds = (
            elapsed / len(func_call_list) if func_call_list else 0.0
        )

        handler_obj = get_extended_telemetry_handler()
        for index, func_call in enumerate(func_call_list):
            tool_name = _extract_tool_name(func_call)
            arguments = _extract_tool_arguments(func_call)
            execution_result = (
                execution_results[index]
                if index < len(execution_results)
                else None
            )

            tool_inv = ExecuteToolInvocation(
                tool_name=tool_name,
                tool_call_id=_synth_tool_call_id(
                    test_entry_id, model_name, index
                ),
                tool_type="function",
                tool_call_arguments=arguments,
                tool_call_result=execution_result,
            )

            try:
                with handler_obj.execute_tool(tool_inv) as inv:
                    span = inv.span
                    if span is not None and span.is_recording():
                        span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
                        span.set_attribute(BFCL_TOOL_INDEX, index)
                        span.set_attribute(
                            BFCL_TOOL_DURATION_IS_ESTIMATED, True
                        )
                        if test_entry_id is not None:
                            span.set_attribute(
                                BFCL_TEST_ENTRY_ID, str(test_entry_id)
                            )
                        if isinstance(execution_result, str) and execution_result.startswith(
                            "Error during execution:"
                        ):
                            try:
                                from opentelemetry.trace import (
                                    Status,
                                    StatusCode,
                                )

                                span.set_status(
                                    Status(
                                        StatusCode.ERROR,
                                        execution_result[:200],
                                    )
                                )
                            except Exception:  # noqa: BLE001
                                pass
                    # Faking per-call latency by sleeping the budgeted slice
                    # would distort BFCL execution; we instead rely on
                    # span start/end (currently both wall-clock-now).
                    # The ``bfcl.tool.duration_is_estimated`` attribute
                    # signals the limitation to consumers.
                    _ = per_call_seconds  # unused but documented
                    # Bump a per-AGENT counter for downstream debugging.
+ next_tool_index() + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 TOOL: span emission failed for %s", + tool_name, + exc_info=True, + ) + + return result + + +def _extract_tool_name(func_call: Any) -> str: + if not isinstance(func_call, str) or "(" not in func_call: + return "unknown" + head = func_call.split("(", 1)[0] + # ``head`` may be ``module.method`` or ``instance.method`` - keep the + # last segment which is the actual callable. + return head.split(".")[-1] or "unknown" + + +def _extract_tool_arguments(func_call: Any) -> Optional[str]: + if not isinstance(func_call, str): + return None + if "(" not in func_call or not func_call.endswith(")"): + return func_call + args_part = func_call[func_call.index("(") + 1 : -1] + return args_part if args_part else None + + +def _synth_tool_call_id( + test_entry_id: Optional[Any], model_name: Optional[Any], index: int +) -> str: + parts = [ + str(test_entry_id) if test_entry_id is not None else "no_id", + str(model_name) if model_name is not None else "no_model", + str(index), + ] + return "-".join(parts) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py new file mode 100644 index 000000000..66e9fa6e1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py @@ -0,0 +1,17 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Instrumented-package requirement consumed by BaseInstrumentor.
_instruments = ("bfcl-eval >= 4.0.0",)

# This instrumentation emits spans only; no metrics.
_supports_metrics = False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py
new file mode 100644
index 000000000..c63bbc62b
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py
@@ -0,0 +1,144 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpers for the BFCL v4 instrumentation.

The :class:`GenAIHookHelper` mirrors the helper used by the LoongSuite CrewAI
instrumentation: it gates ``gen_ai.input.messages`` /
``gen_ai.output.messages`` / ``gen_ai.system_instructions`` on the standard
LoongSuite content-capture environment knobs so that prompt content is not
exported by default.
"""

from __future__ import annotations

import dataclasses
import logging
from typing import Any, Dict, List, Optional

from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
from opentelemetry.trace import Span
from opentelemetry.util.genai.types import (
    ContentCapturingMode,
    InputMessage,
    MessagePart,
    OutputMessage,
    Text,
)
from opentelemetry.util.genai.utils import (
    gen_ai_json_dumps,
    get_content_capturing_mode,
    is_experimental_mode,
)

logger = logging.getLogger(__name__)


class GenAIHookHelper:
    """Conditionally write prompt / completion content to the span."""

    def __init__(self, capture_content: bool = True) -> None:
        # Instance-level kill switch on top of the environment-driven
        # content-capture mode checked in ``on_completion``.
        self.capture_content = capture_content

    def on_completion(
        self,
        span: Span,
        inputs: Optional[List[InputMessage]] = None,
        outputs: Optional[List[OutputMessage]] = None,
        system_instructions: Optional[List[MessagePart]] = None,
        attributes: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Attach messages and extra attributes to ``span``.

        Message content is written only when ``capture_content`` is set,
        experimental mode is enabled, and the content-capturing mode
        includes spans; ``attributes`` are always written (per-key
        failures are logged at DEBUG and skipped).
        """
        if not span.is_recording():
            return

        if self.capture_content and is_experimental_mode():
            mode = get_content_capturing_mode()
            should_capture_span = mode in (
                ContentCapturingMode.SPAN_ONLY,
                ContentCapturingMode.SPAN_AND_EVENT,
            )

            if should_capture_span:
                if inputs:
                    span.set_attribute(
                        gen_ai_attributes.GEN_AI_INPUT_MESSAGES,
                        gen_ai_json_dumps(
                            [dataclasses.asdict(i) for i in inputs]
                        ),
                    )
                if outputs:
                    span.set_attribute(
                        gen_ai_attributes.GEN_AI_OUTPUT_MESSAGES,
                        gen_ai_json_dumps(
                            [dataclasses.asdict(o) for o in outputs]
                        ),
                    )
                if system_instructions:
                    span.set_attribute(
                        gen_ai_attributes.GEN_AI_SYSTEM_INSTRUCTIONS,
                        gen_ai_json_dumps(
                            [dataclasses.asdict(s) for s in system_instructions]
                        ),
                    )

        if attributes:
            for key, value in attributes.items():
                if value is None:
                    continue
                try:
                    span.set_attribute(key, value)
                except Exception:  # noqa: BLE001
                    logger.debug(
                        "bfclv4: failed to set attribute %s", key, exc_info=True
                    )

+def to_text_input(role: str, content: Any) -> List[InputMessage]: + if content in (None, "", [], {}): + return [] + text = content if isinstance(content, str) else _to_safe_str(content) + return [InputMessage(role=role, parts=[Text(content=text)])] + + +def to_text_output( + role: str, content: Any, finish_reason: str = "stop" +) -> List[OutputMessage]: + if content in (None, "", [], {}): + return [] + text = content if isinstance(content, str) else _to_safe_str(content) + return [ + OutputMessage( + role=role, parts=[Text(content=text)], finish_reason=finish_reason + ) + ] + + +def _to_safe_str(value: Any) -> str: + """Best-effort JSON serialisation, falling back to ``str()``. + + The wrapper code never wants a serialisation failure to break a span. + """ + try: + return gen_ai_json_dumps(value) + except Exception: # noqa: BLE001 + try: + return str(value) + except Exception: # noqa: BLE001 + return "" + + +def truncate_text(value: str, limit: int = 4096) -> str: + if len(value) <= limit: + return value + return value[:limit] + f"..." diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py new file mode 100644 index 000000000..3263662eb --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.1.3.dev0"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py
new file mode 100644
index 000000000..41446ee3b
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py
@@ -0,0 +1,52 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Smoke tests for ``BFCLv4Instrumentor``.

These tests do not require ``bfcl-eval`` to be installed; they only verify
that importing the package and calling ``instrument()`` / ``uninstrument()``
works (and degrades gracefully when ``bfcl-eval`` is missing).
"""

import importlib

import pytest


def test_import_instrumentor_package():
    # Package import alone must succeed without bfcl-eval present.
    module = importlib.import_module("opentelemetry.instrumentation.bfclv4")
    assert hasattr(module, "BFCLv4Instrumentor")


def test_instrumentation_dependencies_listed():
    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor
    from opentelemetry.instrumentation.bfclv4.package import _instruments

    instr = BFCLv4Instrumentor()
    assert tuple(instr.instrumentation_dependencies()) == _instruments


def test_instrument_uninstrument_no_bfcl_no_raise():
    """When ``bfcl-eval`` is missing, every wrap call logs and continues.

    The instrumentor must not raise from ``instrument()`` /
    ``uninstrument()`` even if the target framework cannot be imported.
    """

    pytest.importorskip("opentelemetry.util.genai.extended_handler")
    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor

    instr = BFCLv4Instrumentor()
    instr.instrument()
    instr.uninstrument()
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py
new file mode 100644
index 000000000..21bbf6348
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py
@@ -0,0 +1,113 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for the framework-agnostic helpers."""

import contextvars

import pytest


def test_state_lifecycle():
    from opentelemetry.instrumentation.bfclv4.internal.state import (
        bump_round,
        bump_turn,
        get_state,
        init_state,
        next_tool_index,
        reset_state,
    )

    token = init_state()
    try:
        state = get_state()
        assert state == {"turn_idx": 0, "fc_round": 0, "tool_index": 0}

        assert bump_round() == 1
        assert bump_round() == 2
        assert bump_turn() == 1
        # bump_turn resets fc_round
        state = get_state()
        assert state["turn_idx"] == 1
        assert state["fc_round"] == 0
        assert next_tool_index() == 0
        assert next_tool_index() == 1
    finally:
        reset_state(token)

    # After reset the state should be gone (None default).
    assert get_state() is None


def test_context_propagating_executor_carries_contextvars():
    from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import (
        ContextPropagatingExecutor,
    )

    cv: contextvars.ContextVar[str] = contextvars.ContextVar(
        "bfclv4_test_cv", default="default"
    )
    cv.set("from_main_thread")

    def _read():
        return cv.get()

    # The executor must capture the submitting thread's context, not the
    # worker thread's default.
    with ContextPropagatingExecutor(max_workers=2) as pool:
        future = pool.submit(_read)
        assert future.result() == "from_main_thread"


def test_extract_tool_name_and_arguments():
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _extract_tool_arguments,
        _extract_tool_name,
    )

    assert _extract_tool_name("calc.add(1, 2)") == "add"
    assert _extract_tool_name("list_files()") == "list_files"
    assert _extract_tool_name("not a call") == "unknown"
    assert _extract_tool_arguments("foo(a=1, b=2)") == "a=1, b=2"
    assert _extract_tool_arguments("foo()") is None


def test_infer_finish_reason_heuristic():
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _infer_finish_reason,
    )

    assert _infer_finish_reason([]) == "empty_response"
    assert _infer_finish_reason([[]]) == "empty_response"
    assert _infer_finish_reason([{"name": "x"}]) == "tool_calls"
    assert _infer_finish_reason("plain string") == "stop"
    assert _infer_finish_reason(None) == "unknown"


def test_provider_mapping_without_bfcl(monkeypatch):
    # NOTE(review): the ``monkeypatch`` fixture is unused here — consider
    # dropping it.
    from opentelemetry.instrumentation.bfclv4.internal.provider import (
        infer_provider,
    )

    pytest.importorskip(
        "opentelemetry.util.genai.extended_types",
    )

    class _Dummy:
        model_style = None

    name, extras = infer_provider(_Dummy())
    # If bfcl-eval is not installed, ``ModelStyle`` import fails and we get
    # ``unknown``; otherwise we still get ``unknown`` because ``model_style``
    # is None.
    assert name == "unknown"
    assert extras == {}