From cccf54b0b00c0d8b58c575798329268ad0d2b07f Mon Sep 17 00:00:00 2001 From: musi Date: Wed, 6 May 2026 18:38:36 +0800 Subject: [PATCH 1/3] feat(bfclv4): add instrumentation for Berkeley Function Call Leaderboard v4 Introduce loongsuite-instrumentation-bfclv4 covering BFCL v4 (bfcl_eval) per the design in llm-dev/bfclv4/execute.md: * ENTRY span around bfcl_eval._llm_response_generation.generate_results, with a narrow swap of that module's ThreadPoolExecutor name to a contextvars-propagating subclass so worker threads inherit the ENTRY trace context. * AGENT span around BaseHandler.inference (kind=AGENT, op=invoke_agent), picking up token usage from the metadata BFCL writes back. * STEP spans created reflectively for every concrete handler discovered via bfcl_eval.constants.model_config.MODEL_CONFIG_MAPPING; each STEP re-invokes the handler's _parse_query_response_* to harvest token counts and latency. * Per-call TOOL spans emitted from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call (one span per func_call entry in the batch). * Provider override mapping that routes OSSMODEL handlers to vllm/sglang based on args.backend, plus contextvars-based bfcl.turn_idx / gen_ai.react.round tracking. LLM spans are intentionally not created by this plugin; they continue to be produced by the downstream vendor SDK probes (OpenAI / Anthropic / DashScope / etc.). 
--- .../CHANGELOG.md | 22 + .../README.md | 79 ++ .../pyproject.toml | 54 ++ .../instrumentation/bfclv4/__init__.py | 297 ++++++++ .../bfclv4/internal/__init__.py | 13 + .../bfclv4/internal/attributes.py | 38 + .../bfclv4/internal/provider.py | 71 ++ .../instrumentation/bfclv4/internal/state.py | 93 +++ .../bfclv4/internal/threading_propagation.py | 43 ++ .../bfclv4/internal/wrappers.py | 691 ++++++++++++++++++ .../instrumentation/bfclv4/package.py | 17 + .../instrumentation/bfclv4/utils.py | 144 ++++ .../instrumentation/bfclv4/version.py | 15 + .../tests/__init__.py | 0 .../tests/test_instrumentor.py | 52 ++ .../tests/test_internals.py | 113 +++ 16 files changed, 1742 insertions(+) create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py create mode 100644 
instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md new file mode 100644 index 000000000..62fb6539b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +All notable changes to the LoongSuite BFCL v4 instrumentation are documented +in this file. + +## Unreleased + +### Added + +- Initial release of `loongsuite-instrumentation-bfclv4`. +- ENTRY span around `bfcl_eval._llm_response_generation.generate_results`. +- AGENT span around `bfcl_eval.model_handler.base_handler.BaseHandler.inference` + with cross-thread OTel context propagation via a narrow patch of + `bfcl_eval._llm_response_generation.ThreadPoolExecutor`. +- STEP spans created by reflectively wrapping each handler's + `_query_FC` / `_query_prompting` (discovered via + `bfcl_eval.constants.model_config.MODEL_CONFIG_MAPPING`). +- Per-call TOOL spans emitted by wrapping + `bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call`. +- Provider override mapping for OSS handlers (vLLM / SGLang). +- Multi-turn `bfcl.turn_idx` and ReAct `gen_ai.react.round` tracking via + `contextvars`. 
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md new file mode 100644 index 000000000..7a4e5d69d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md @@ -0,0 +1,79 @@ +# LoongSuite BFCL v4 Instrumentation + +LoongSuite Python instrumentation for the [Berkeley Function Call +Leaderboard v4](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) +(`bfcl-eval`, package `bfcl_eval`). + +## Span Topology + +``` +ENTRY enter_ai_application_system gen_ai.span.kind=ENTRY, op=enter +└─ AGENT invoke_agent {test_entry_id} gen_ai.span.kind=AGENT, op=invoke_agent + ├─ STEP react step gen_ai.span.kind=STEP, op=react + │ ├─ LLM chat {model} (created by downstream vendor SDK probe) + │ └─ TOOL execute_tool {fn} gen_ai.span.kind=TOOL, op=execute_tool + └─ STEP react step + └─ ... +``` + +This instrumentation deliberately does **not** create LLM spans. They are +emitted by the downstream vendor SDK probe (OpenAI / Anthropic / Google / +DashScope / LiteLLM / etc.) so that token usage and request payloads stay in +sync with the SDK that actually performed the request. + +## Installation + +```bash +pip install loongsuite-instrumentation-bfclv4 +``` + +## Usage + +```bash +opentelemetry-instrument bfcl generate \ + --model gpt-4o-2024-11-20-FC \ + --test-category simple_python \ + --num-threads 2 +``` + +Or programmatically: + +```python +from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + +BFCLv4Instrumentor().instrument() +# ... run BFCL ... 
+BFCLv4Instrumentor().uninstrument() +``` + +## Compatibility With Downstream LLM SDK Probes + +| Scenario | Recommended downstream probe | +| --- | --- | +| OpenAI / OpenAI Responses / OSS via vLLM / SGLang / DeepSeek (OpenAI-compatible) | `opentelemetry-instrumentation-openai` | +| Anthropic / Claude | `loongsuite-instrumentation-claude-agent-sdk` | +| Gemini / Google | `loongsuite-instrumentation-google-adk` | +| Qwen / DashScope | `loongsuite-instrumentation-dashscope` | +| LiteLLM | `loongsuite-instrumentation-litellm` | + +## OSS Provider Notes + +For OSS handlers (vLLM / SGLang served via the OpenAI-compatible API), the +BFCL probe sets `gen_ai.provider.name` to `vllm` / `sglang` / `oss` and adds +`bfcl.oss.backend` for disambiguation. Downstream OpenAI probes will still +report `gen_ai.provider.name=openai` on the LLM span; this is expected. + +## Custom Attributes + +| Attribute | Where | Description | +| --- | --- | --- | +| `gen_ai.framework` = `bfclv4` | ENTRY/AGENT/STEP/TOOL | Framework tag | +| `bfcl.test_category` | ENTRY/AGENT | Test category | +| `bfcl.num_threads` | ENTRY | Configured thread pool size | +| `bfcl.test_case_count` | ENTRY | Number of test cases | +| `bfcl.run_ids` | ENTRY | Whether the run targeted specific IDs | +| `bfcl.test_entry_id` | AGENT | Test entry id | +| `bfcl.turn_idx` | STEP | Multi-turn turn index (0-based) | +| `bfcl.query_mode` | STEP | `FC` or `prompting` | +| `bfcl.oss.backend` | AGENT/STEP | `vllm` / `sglang` / `unknown` (only OSS) | +| `bfcl.tool.duration_is_estimated` | TOOL | True (latency is averaged across batch) | diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml new file mode 100644 index 000000000..3eeb5d026 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = 
"hatchling.build" + +[project] +name = "loongsuite-instrumentation-bfclv4" +dynamic = ["version"] +description = "LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "bfcl-eval >= 4.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +bfclv4 = "opentelemetry.instrumentation.bfclv4:BFCLv4Instrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/bfclv4/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py new file mode 100644 index 000000000..34a5a9b10 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py @@ -0,0 
+1,297 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + + BFCLv4Instrumentor().instrument() + # ... run BFCL ... + BFCLv4Instrumentor().uninstrument() + +API +--- +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection, List, Tuple + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + BaseHandlerInferenceWrapper, + ExecuteFuncCallWrapper, + GenerateResultsWrapper, + QueryWrapper, + TurnBumpWrapper, +) +from opentelemetry.instrumentation.bfclv4.package import _instruments +from opentelemetry.instrumentation.bfclv4.utils import GenAIHookHelper +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap + +logger = logging.getLogger(__name__) + +__all__ = ["BFCLv4Instrumentor"] + + +_GENERATE_RESULTS_MODULE = "bfcl_eval._llm_response_generation" +_GENERATE_RESULTS_NAME = "generate_results" + +_BASE_HANDLER_MODULE = "bfcl_eval.model_handler.base_handler" +_BASE_HANDLER_NAME = "BaseHandler.inference" + +_EXECUTE_TOOL_MODULE = ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils" +) +_EXECUTE_TOOL_NAME = "execute_multi_turn_func_call" + + +# 
``MODEL_CONFIG_MAPPING`` already imports every concrete handler at module +# load time, so iterating over its values gives us the canonical handler +# class set without risking new vendor SDK imports. +def _iter_handler_classes() -> List[type]: + try: + from bfcl_eval.constants.model_config import ( # noqa: PLC0415 + MODEL_CONFIG_MAPPING, + ) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: cannot import MODEL_CONFIG_MAPPING: %s", exc + ) + return [] + + classes: List[type] = [] + seen_class_ids: set[int] = set() + for cfg in MODEL_CONFIG_MAPPING.values(): + cls = getattr(cfg, "model_handler", None) + if cls is None or not isinstance(cls, type): + continue + if id(cls) in seen_class_ids: + continue + seen_class_ids.add(id(cls)) + classes.append(cls) + return classes + + +class BFCLv4Instrumentor(BaseInstrumentor): + """An instrumentor for the BFCL v4 (``bfcl_eval``) framework.""" + + def __init__(self) -> None: + super().__init__() + if not hasattr(self, "_wrapped_query_methods"): + self._wrapped_query_methods: List[Tuple[type, str]] = [] + if not hasattr(self, "_wrapped_turn_methods"): + self._wrapped_turn_methods: List[Tuple[type, str]] = [] + if not hasattr(self, "_entry_wrapped"): + self._entry_wrapped = False + if not hasattr(self, "_inference_wrapped"): + self._inference_wrapped = False + if not hasattr(self, "_tool_wrapped"): + self._tool_wrapped = False + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + # ------------------------------------------------------------------ + # _instrument + + def _instrument(self, **kwargs: Any) -> None: # noqa: D401 + helper = GenAIHookHelper() + + # 1) ENTRY ----------------------------------------------------- + try: + wrap_function_wrapper( + module=_GENERATE_RESULTS_MODULE, + name=_GENERATE_RESULTS_NAME, + wrapper=GenerateResultsWrapper(helper), + ) + self._entry_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap 
%s.%s: %s", + _GENERATE_RESULTS_MODULE, + _GENERATE_RESULTS_NAME, + exc, + ) + + # 2) AGENT ----------------------------------------------------- + try: + wrap_function_wrapper( + module=_BASE_HANDLER_MODULE, + name=_BASE_HANDLER_NAME, + wrapper=BaseHandlerInferenceWrapper(helper), + ) + self._inference_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap %s.%s: %s", + _BASE_HANDLER_MODULE, + _BASE_HANDLER_NAME, + exc, + ) + + # 3) STEP + 4) turn maintenance -------------------------------- + self._instrument_handlers(helper) + + # 5) TOOL ------------------------------------------------------ + try: + wrap_function_wrapper( + module=_EXECUTE_TOOL_MODULE, + name=_EXECUTE_TOOL_NAME, + wrapper=ExecuteFuncCallWrapper(helper), + ) + self._tool_wrapped = True + except Exception as exc: # noqa: BLE001 + logger.warning( + "bfclv4: failed to wrap %s.%s: %s", + _EXECUTE_TOOL_MODULE, + _EXECUTE_TOOL_NAME, + exc, + ) + + def _instrument_handlers(self, helper: GenAIHookHelper) -> None: + # Reflectively wrap every concrete ``_query_FC`` / ``_query_prompting`` + # plus the turn-maintenance helpers; we de-duplicate by function id so + # subclasses that share an inherited implementation are wrapped only + # once. 
+ seen_func_ids: set[int] = set() + + query_pairs = ( + ("_query_FC", "FC"), + ("_query_prompting", "prompting"), + ) + turn_pairs = ( + ("add_first_turn_message_FC", True), + ("add_first_turn_message_prompting", True), + ("_add_next_turn_user_message_FC", False), + ("_add_next_turn_user_message_prompting", False), + ) + + for cls in _iter_handler_classes(): + class_dict = getattr(cls, "__dict__", {}) + for method_name, mode in query_pairs: + method = class_dict.get(method_name) + if method is None or not callable(method): + continue + key = id(method) + if key in seen_func_ids: + continue + seen_func_ids.add(key) + try: + wrap_function_wrapper( + module=cls.__module__, + name=f"{cls.__name__}.{method_name}", + wrapper=QueryWrapper(helper, mode), + ) + self._wrapped_query_methods.append((cls, method_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s.%s: %s", + cls.__module__, + cls.__name__, + method_name, + exc, + ) + + for method_name, is_first in turn_pairs: + method = class_dict.get(method_name) + if method is None or not callable(method): + continue + key = id(method) + if key in seen_func_ids: + continue + seen_func_ids.add(key) + try: + wrap_function_wrapper( + module=cls.__module__, + name=f"{cls.__name__}.{method_name}", + wrapper=TurnBumpWrapper(reset=is_first), + ) + self._wrapped_turn_methods.append((cls, method_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s.%s: %s", + cls.__module__, + cls.__name__, + method_name, + exc, + ) + + # ------------------------------------------------------------------ + # _uninstrument + + def _uninstrument(self, **kwargs: Any) -> None: # noqa: D401 + if self._tool_wrapped: + try: + module = importlib.import_module(_EXECUTE_TOOL_MODULE) + unwrap(module, _EXECUTE_TOOL_NAME) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap execute_multi_turn_func_call: %s", + exc, + ) + self._tool_wrapped = 
False + + for cls, method_name in self._wrapped_query_methods: + try: + unwrap(cls, method_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + cls.__name__, + method_name, + exc, + ) + self._wrapped_query_methods = [] + + for cls, method_name in self._wrapped_turn_methods: + try: + unwrap(cls, method_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + cls.__name__, + method_name, + exc, + ) + self._wrapped_turn_methods = [] + + if self._inference_wrapped: + try: + base_module = importlib.import_module(_BASE_HANDLER_MODULE) + unwrap(base_module.BaseHandler, "inference") + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap BaseHandler.inference: %s", exc + ) + self._inference_wrapped = False + + if self._entry_wrapped: + try: + module = importlib.import_module(_GENERATE_RESULTS_MODULE) + unwrap(module, _GENERATE_RESULTS_NAME) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap generate_results: %s", exc + ) + self._entry_wrapped = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py new file mode 100644 index 000000000..b0a6f4284 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py @@ -0,0 +1,13 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py new file mode 100644 index 000000000..774200aba --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py @@ -0,0 +1,38 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constant attribute keys used by the BFCL v4 instrumentation.""" + +from __future__ import annotations + +from typing import Final + +FRAMEWORK_NAME: Final = "bfclv4" + +# gen_ai.* attribute keys that are not exported by +# opentelemetry-semantic-conventions today. +GEN_AI_FRAMEWORK: Final = "gen_ai.framework" +GEN_AI_PROVIDER_NAME: Final = "gen_ai.provider.name" + +# BFCL-specific (vendor) attribute keys. 
+BFCL_TEST_CATEGORY: Final = "bfcl.test_category" +BFCL_NUM_THREADS: Final = "bfcl.num_threads" +BFCL_TEST_CASE_COUNT: Final = "bfcl.test_case_count" +BFCL_RUN_IDS: Final = "bfcl.run_ids" +BFCL_TEST_ENTRY_ID: Final = "bfcl.test_entry_id" +BFCL_TURN_IDX: Final = "bfcl.turn_idx" +BFCL_QUERY_MODE: Final = "bfcl.query_mode" +BFCL_OSS_BACKEND: Final = "bfcl.oss.backend" +BFCL_TOOL_DURATION_IS_ESTIMATED: Final = "bfcl.tool.duration_is_estimated" +BFCL_TOOL_INDEX: Final = "bfcl.tool.index" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py new file mode 100644 index 000000000..efa2c77dc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py @@ -0,0 +1,71 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Map BFCL ``ModelStyle`` enum values to ``gen_ai.provider.name``.""" + +from __future__ import annotations + +import os +from typing import Any, Dict, Tuple + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_OSS_BACKEND, +) + +# The BFCL backend name (vllm / sglang / ...) is communicated from the ENTRY +# wrapper to the per-thread STEP/AGENT wrappers via this env var. 
The ENTRY +# wrapper writes to it before invoking the wrapped function and clears it in +# the ``finally`` clause. +OSS_BACKEND_ENV = "BFCL_BACKEND" + + +def infer_provider(handler: Any) -> Tuple[str, Dict[str, Any]]: + """Return ``(provider_name, extra_attributes)`` for a BFCL handler. + + Falls back to ``"unknown"`` if BFCL is not importable or if the handler + has no ``model_style`` attribute. + """ + + try: + from bfcl_eval.constants.enums import ( # noqa: PLC0415 + ModelStyle, + ) + except ImportError: + return "unknown", {} + + style = getattr(handler, "model_style", None) + if style is None: + return "unknown", {} + + if style is ModelStyle.OSSMODEL: + backend = (os.getenv(OSS_BACKEND_ENV) or "").lower() + if backend in ("vllm", "sglang"): + return backend, {BFCL_OSS_BACKEND: backend} + return "oss", {BFCL_OSS_BACKEND: "unknown"} + + mapping = { + ModelStyle.OPENAI_COMPLETIONS: "openai", + ModelStyle.OPENAI_RESPONSES: "openai", + ModelStyle.ANTHROPIC: "anthropic", + ModelStyle.GOOGLE: "gcp.gemini", + ModelStyle.MISTRAL: "mistral_ai", + ModelStyle.COHERE: "cohere", + ModelStyle.AMAZON: "aws.bedrock", + ModelStyle.FIREWORK_AI: "fireworks_ai", + ModelStyle.WRITER: "writer", + ModelStyle.NOVITA_AI: "novita", + ModelStyle.NEXUS: "nexusflow", + ModelStyle.GORILLA: "gorilla", + } + return mapping.get(style, "unknown"), {} diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py new file mode 100644 index 000000000..ae4861035 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py @@ -0,0 +1,93 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Per-thread ReAct state for the BFCL v4 instrumentation. + +We use ``contextvars.ContextVar`` so that each worker thread spawned by the +BFCL ``ThreadPoolExecutor`` gets its own copy. ``_ContextPropagatingExecutor`` +in :mod:`threading_propagation` makes sure ENTRY-time context is copied into +the worker thread; the BaseHandler.inference wrapper then initializes a fresh +state on top of that copy. +""" + +from __future__ import annotations + +import contextvars +from typing import Any, Dict, Optional + +_REACT_STATE: contextvars.ContextVar[Optional[Dict[str, Any]]] = ( + contextvars.ContextVar("bfclv4_react_state", default=None) +) + + +def init_state() -> contextvars.Token: + """Initialise per-AGENT state and return the reset token.""" + state: Dict[str, Any] = { + # ``turn_idx`` is incremented by the wrapper around + # ``_add_next_turn_user_message_*``; it stays ``0`` for single-turn + # tests. + "turn_idx": 0, + # ``fc_round`` is the ReAct round counter. We bump it on every STEP + # entry so the first STEP within a turn ends up with ``round=1``. + "fc_round": 0, + # Counter of executed tool calls within the current AGENT - useful for + # the TOOL span ``tool_call_id`` synthesis. + "tool_index": 0, + } + return _REACT_STATE.set(state) + + +def reset_state(token: contextvars.Token) -> None: + try: + _REACT_STATE.reset(token) + except (LookupError, ValueError): + # Token may have already been reset (e.g. nested error path). 
+ pass + + +def get_state() -> Optional[Dict[str, Any]]: + return _REACT_STATE.get() + + +def bump_round() -> int: + state = _REACT_STATE.get() + if state is None: + return 1 + state["fc_round"] = state.get("fc_round", 0) + 1 + return state["fc_round"] + + +def reset_round_for_turn() -> None: + state = _REACT_STATE.get() + if state is None: + return + state["fc_round"] = 0 + + +def bump_turn() -> int: + state = _REACT_STATE.get() + if state is None: + return 0 + state["turn_idx"] = state.get("turn_idx", 0) + 1 + state["fc_round"] = 0 + return state["turn_idx"] + + +def next_tool_index() -> int: + state = _REACT_STATE.get() + if state is None: + return 0 + idx = state.get("tool_index", 0) + state["tool_index"] = idx + 1 + return idx diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py new file mode 100644 index 000000000..d19c05799 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py @@ -0,0 +1,43 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Context-propagating ``ThreadPoolExecutor`` used by the ENTRY wrapper. 
+ +``concurrent.futures.ThreadPoolExecutor`` does not automatically copy the +current ``contextvars`` context (which holds the OTel current span) into +worker threads. We subclass it and copy ``contextvars.copy_context()`` per +``submit`` so the AGENT span created inside the worker thread can attach as +a child of the ENTRY span. + +We only swap the ``ThreadPoolExecutor`` *name* in the +``bfcl_eval._llm_response_generation`` namespace; the global +``concurrent.futures.ThreadPoolExecutor`` is untouched. +""" + +from __future__ import annotations + +import contextvars +from concurrent.futures import ThreadPoolExecutor as _RealExecutor + + +class ContextPropagatingExecutor(_RealExecutor): + """``ThreadPoolExecutor`` that propagates the calling ``Context``. + + Only the ``submit`` method is overridden because BFCL only uses + ``submit`` (see ``_llm_response_generation.generate_results``). + """ + + def submit(self, fn, /, *args, **kwargs): # type: ignore[override] + ctx = contextvars.copy_context() + return super().submit(ctx.run, fn, *args, **kwargs) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py new file mode 100644 index 000000000..9683cb85b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py @@ -0,0 +1,691 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Wrapper classes for the BFCL v4 instrumentation. + +Each wrapper follows the standard ``wrapt`` callable contract:: + + def __call__(self, wrapped, instance, args, kwargs): + ... + +All wrappers rely on :func:`get_extended_telemetry_handler` (LoongSuite +``util-genai``) to create the actual spans, so that ENTRY / AGENT / STEP / +TOOL spans get the canonical ``gen_ai.span.kind`` and operation-name values +that the LoongSuite semantic-validator expects. +""" + +from __future__ import annotations + +import logging +import os +import time +from typing import Any, Callable, Iterable, List, Optional + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_NUM_THREADS, + BFCL_OSS_BACKEND, + BFCL_QUERY_MODE, + BFCL_RUN_IDS, + BFCL_TEST_CASE_COUNT, + BFCL_TEST_CATEGORY, + BFCL_TEST_ENTRY_ID, + BFCL_TOOL_DURATION_IS_ESTIMATED, + BFCL_TOOL_INDEX, + BFCL_TURN_IDX, + FRAMEWORK_NAME, + GEN_AI_FRAMEWORK, + GEN_AI_PROVIDER_NAME, +) +from opentelemetry.instrumentation.bfclv4.internal.provider import ( + OSS_BACKEND_ENV, + infer_provider, +) +from opentelemetry.instrumentation.bfclv4.internal.state import ( + bump_round, + bump_turn, + init_state, + next_tool_index, + reset_state, +) +from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import ( + ContextPropagatingExecutor, +) +from opentelemetry.instrumentation.bfclv4.utils import ( + GenAIHookHelper, + to_text_input, + to_text_output, + truncate_text, +) +from opentelemetry.util.genai.extended_handler import ( + get_extended_telemetry_handler, +) +from 
opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Helpers + + +def _safe_get(obj: Any, key: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _flatten_tokens(value: Any) -> Optional[int]: + """Sum a possibly nested ``int|float|list|list[list]`` BFCL token field.""" + if value is None: + return None + if isinstance(value, (int, float)): + return int(value) + if isinstance(value, Iterable): + total = 0 + any_seen = False + for item in value: + sub = _flatten_tokens(item) + if sub is not None: + total += sub + any_seen = True + if any_seen: + return total + return None + + +def _test_category_from_id(test_entry_id: Optional[str]) -> Optional[str]: + if not test_entry_id or "_" not in test_entry_id: + return None + return test_entry_id.rsplit("_", 1)[0] + + +def _join_test_category(value: Any) -> Optional[str]: + if value is None: + return None + if isinstance(value, str): + return value + if isinstance(value, (list, tuple, set)): + joined = ",".join(str(v) for v in value if v is not None) + return joined or None + return str(value) + + +# --------------------------------------------------------------------------- +# ENTRY wrapper + + +class GenerateResultsWrapper: + """Wraps ``bfcl_eval._llm_response_generation.generate_results``. + + Responsibilities: + + * Open the ENTRY span (``enter_ai_application_system``). + * Temporarily swap the ``ThreadPoolExecutor`` reference inside the BFCL + generation module to a context-propagating subclass so that AGENT spans + created in worker threads inherit the ENTRY span as parent. + * Publish ``args.backend`` to ``BFCL_BACKEND`` so that + :func:`infer_provider` can attribute OSS spans to vllm / sglang. 
+ """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``generate_results(args, model_name, test_cases_total)`` + cli_args = args[0] if len(args) >= 1 else kwargs.get("args") + model_name = args[1] if len(args) >= 2 else kwargs.get("model_name") + test_cases_total = ( + args[2] if len(args) >= 3 else kwargs.get("test_cases_total") + ) + + try: + from bfcl_eval import ( # noqa: PLC0415 + _llm_response_generation as _bfcl_gen, + ) + except ImportError: + return wrapped(*args, **kwargs) + + original_executor = getattr(_bfcl_gen, "ThreadPoolExecutor", None) + if original_executor is not None: + _bfcl_gen.ThreadPoolExecutor = ContextPropagatingExecutor + + backend_value = ( + _safe_get(cli_args, "backend", None) if cli_args is not None else None + ) + previous_backend_env = os.environ.get(OSS_BACKEND_ENV) + if backend_value: + os.environ[OSS_BACKEND_ENV] = str(backend_value) + + session_id_default = None + if model_name is not None: + try: + session_id_default = f"{model_name}@{int(time.time())}" + except Exception: # noqa: BLE001 + session_id_default = None + session_id = ( + os.environ.get("BFCL_SESSION_ID") or session_id_default + ) + + entry_inv = EntryInvocation(session_id=session_id) + handler = get_extended_telemetry_handler() + + attributes = {GEN_AI_FRAMEWORK: FRAMEWORK_NAME} + category_value = _join_test_category( + _safe_get(cli_args, "test_category", None) + ) + if category_value: + attributes[BFCL_TEST_CATEGORY] = category_value + num_threads = _safe_get(cli_args, "num_threads", None) + if num_threads is not None: + try: + attributes[BFCL_NUM_THREADS] = int(num_threads) + except (TypeError, ValueError): + pass + if isinstance(test_cases_total, (list, tuple)): + attributes[BFCL_TEST_CASE_COUNT] = len(test_cases_total) + attributes[BFCL_RUN_IDS] = bool( + _safe_get(cli_args, "run_ids", False) + ) + + try: + with handler.entry(entry_inv) as 
inv: + if inv.span is not None and inv.span.is_recording(): + for key, value in attributes.items(): + try: + inv.span.set_attribute(key, value) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 ENTRY set_attribute(%s) failed", + key, + exc_info=True, + ) + return wrapped(*args, **kwargs) + finally: + if original_executor is not None: + try: + _bfcl_gen.ThreadPoolExecutor = original_executor + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 ENTRY: failed to restore ThreadPoolExecutor", + exc_info=True, + ) + if backend_value: + if previous_backend_env is None: + os.environ.pop(OSS_BACKEND_ENV, None) + else: + os.environ[OSS_BACKEND_ENV] = previous_backend_env + + +# --------------------------------------------------------------------------- +# AGENT wrapper + + +_BFCL_INFERENCE_ERROR_PREFIX = "Error during inference:" + + +class BaseHandlerInferenceWrapper: + """Wraps ``BaseHandler.inference``. + + Creates the AGENT span (kind=AGENT, op=invoke_agent) and initialises the + per-thread ReAct state used by the STEP wrapper. + + BFCL's outer ``multi_threaded_inference`` catches every exception and + converts it into a ``"Error during inference: ..."`` string; we mirror + that behaviour by setting the AGENT span status to ERROR when the + returned ``result`` looks like an error string, instead of relying on + a re-raised exception. 
+ """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``inference(self, test_entry, include_input_log, exclude_state_log)`` + test_entry = args[0] if args else kwargs.get("test_entry") + if not isinstance(test_entry, dict): + return wrapped(*args, **kwargs) + + provider, extra_attrs = infer_provider(instance) + request_model = getattr(instance, "model_name", None) + test_entry_id = test_entry.get("id") + category = _test_category_from_id(test_entry_id) + involved_classes = test_entry.get("involved_classes") or [] + agent_description = ( + ", ".join(str(c) for c in involved_classes) + if isinstance(involved_classes, (list, tuple)) + else None + ) + + invocation = InvokeAgentInvocation( + provider=provider or "unknown", + request_model=request_model, + agent_id=test_entry_id, + agent_name=category or "bfcl_agent", + agent_description=agent_description or None, + conversation_id=test_entry_id, + ) + + token = init_state() + handler = get_extended_telemetry_handler() + try: + with handler.invoke_agent(invocation) as inv: + if inv.span is not None and inv.span.is_recording(): + inv.span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + if provider: + inv.span.set_attribute(GEN_AI_PROVIDER_NAME, provider) + if test_entry_id is not None: + inv.span.set_attribute( + BFCL_TEST_ENTRY_ID, test_entry_id + ) + if category is not None: + inv.span.set_attribute(BFCL_TEST_CATEGORY, category) + for key, value in extra_attrs.items(): + if value is not None: + inv.span.set_attribute(key, value) + + # Capture inputs for the AGENT (gated by content-capture mode). + question = test_entry.get("question") + if question is not None: + inv.input_messages = to_text_input( + "user", truncate_text(_safe_str(question)) + ) + + # Run the original inference call. 
+ try: + result = wrapped(*args, **kwargs) + except Exception as exc: + # The CM will mark the span as failed; we leave it to + # the handler/CM to call ``fail_invoke_agent``. + raise exc + + # Detect BFCL's own captured error path (no exception raised + # but the returned result is the error string). + result_payload = ( + result[0] if isinstance(result, tuple) and result else None + ) + metadata_payload = ( + result[1] + if isinstance(result, tuple) and len(result) >= 2 + else None + ) + + if ( + isinstance(result_payload, str) + and result_payload.startswith(_BFCL_INFERENCE_ERROR_PREFIX) + and inv.span is not None + and inv.span.is_recording() + ): + try: + from opentelemetry.trace import Status, StatusCode + + inv.span.set_status( + Status(StatusCode.ERROR, result_payload[:200]) + ) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 AGENT: failed to set ERROR status", + exc_info=True, + ) + + if isinstance(metadata_payload, dict): + input_tokens = _flatten_tokens( + metadata_payload.get("input_token_count") + ) + output_tokens = _flatten_tokens( + metadata_payload.get("output_token_count") + ) + if input_tokens is not None: + inv.input_tokens = input_tokens + if output_tokens is not None: + inv.output_tokens = output_tokens + + if result_payload is not None: + inv.output_messages = to_text_output( + "assistant", + truncate_text(_safe_str(result_payload)), + ) + + return result + finally: + reset_state(token) + + +def _safe_str(value: Any) -> str: + try: + if isinstance(value, str): + return value + import json + + return json.dumps(value, ensure_ascii=False, default=str) + except Exception: # noqa: BLE001 + try: + return str(value) + except Exception: # noqa: BLE001 + return "" + + +# --------------------------------------------------------------------------- +# STEP wrapper + + +class QueryWrapper: + """Wraps ``._query_FC`` / ``_query_prompting``. 
+ + Creates a ReAct STEP span, attaches token usage by re-calling the + handler's matching ``_parse_query_response_*`` (which is documented as + side-effect-free). + """ + + def __init__(self, helper: GenAIHookHelper, mode: str) -> None: + self._helper = helper + self._mode = mode # "FC" or "prompting" + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + round_idx = bump_round() + provider, extra_attrs = infer_provider(instance) + + invocation = ReactStepInvocation(round=round_idx) + handler_obj = get_extended_telemetry_handler() + with handler_obj.react_step(invocation) as step_inv: + span = step_inv.span + if span is not None and span.is_recording(): + span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + span.set_attribute(BFCL_QUERY_MODE, self._mode) + if provider: + span.set_attribute(GEN_AI_PROVIDER_NAME, provider) + model_name = getattr(instance, "model_name", None) + if model_name: + span.set_attribute( + "gen_ai.request.model", str(model_name) + ) + from opentelemetry.instrumentation.bfclv4.internal.state import ( + get_state, + ) + + state = get_state() + if state is not None: + span.set_attribute(BFCL_TURN_IDX, state.get("turn_idx", 0)) + for key, value in extra_attrs.items(): + if value is not None: + span.set_attribute(key, value) + + try: + api_response, query_latency = wrapped(*args, **kwargs) + except Exception: + # Let the context-manager mark the span as failed; the BFCL + # outer try/except will turn this into an "Error during + # inference: ..." result string at the AGENT layer. + raise + + # Post-call attribute enrichment - use try/except so that any + # vendor-side parsing surprise never breaks BFCL itself. 
+ try: + if span is not None and span.is_recording(): + parser_name = ( + "_parse_query_response_FC" + if self._mode == "FC" + else "_parse_query_response_prompting" + ) + parser = getattr(instance, parser_name, None) + if parser is not None: + parsed = parser(api_response) + if isinstance(parsed, dict): + input_token = parsed.get("input_token") + output_token = parsed.get("output_token") + if isinstance(input_token, (int, float)): + span.set_attribute( + "gen_ai.usage.input_tokens", + int(input_token), + ) + if isinstance(output_token, (int, float)): + span.set_attribute( + "gen_ai.usage.output_tokens", + int(output_token), + ) + if isinstance(input_token, (int, float)) and isinstance( + output_token, (int, float) + ): + span.set_attribute( + "gen_ai.usage.total_tokens", + int(input_token) + int(output_token), + ) + model_resp = parsed.get("model_responses") + step_inv.finish_reason = _infer_finish_reason( + model_resp + ) + if isinstance(query_latency, (int, float)): + try: + span.set_attribute( + "gen_ai.response.time_to_first_token", + int(float(query_latency) * 1e9), + ) + except Exception: # noqa: BLE001 + pass + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 STEP: post-call enrichment failed", exc_info=True + ) + + return api_response, query_latency + + +def _infer_finish_reason(model_responses: Any) -> str: + """Best-effort heuristic for ``gen_ai.react.finish_reason``.""" + if model_responses is None: + return "unknown" + if isinstance(model_responses, list): + if len(model_responses) == 0: + return "empty_response" + if len(model_responses) == 1 and not model_responses[0]: + return "empty_response" + return "tool_calls" + if isinstance(model_responses, str): + # Prompting models often return decoded strings even when there are + # no tool calls - treat as "stop" so downstream callers know there is + # no further work to do. 
+ return "stop" + return "continue" + + +# --------------------------------------------------------------------------- +# turn_idx maintenance wrappers (no spans) + + +class TurnBumpWrapper: + """Wraps ``.add_first_turn_message_*`` and + ``._add_next_turn_user_message_*`` to keep ``bfcl.turn_idx`` in + sync. No spans are created here. + """ + + def __init__(self, *, reset: bool) -> None: + self._reset = reset + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + try: + if self._reset: + # ``add_first_turn_message_*`` runs once at the very start of + # multi-turn / single-turn inference. We only want to reset + # to ``turn_idx=0`` here. + from opentelemetry.instrumentation.bfclv4.internal.state import ( + get_state, + ) + + state = get_state() + if state is not None: + state["turn_idx"] = 0 + state["fc_round"] = 0 + else: + bump_turn() + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4: turn_idx maintenance failed", exc_info=True + ) + return wrapped(*args, **kwargs) + + +# --------------------------------------------------------------------------- +# TOOL wrapper + + +class ExecuteFuncCallWrapper: + """Wraps + ``bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call``. + + BFCL evaluates a list of function-call strings in a single Python call; + we surface each one as its own TOOL span by post-processing the wrapped + result. Per-call latency is approximated by averaging the total elapsed + time across the batch (``bfcl.tool.duration_is_estimated=true``). 
+ """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``execute_multi_turn_func_call(func_call_list, initial_config, + # involved_classes, model_name, + # test_entry_id, long_context=False, + # is_evaL_run=False)`` + func_call_list = ( + args[0] if args else kwargs.get("func_call_list", []) + ) + model_name = ( + args[3] + if len(args) >= 4 + else kwargs.get("model_name") + ) + test_entry_id = ( + args[4] + if len(args) >= 5 + else kwargs.get("test_entry_id") + ) + + if not isinstance(func_call_list, list) or not func_call_list: + return wrapped(*args, **kwargs) + + t0 = time.perf_counter() + try: + result = wrapped(*args, **kwargs) + finally: + elapsed = max(time.perf_counter() - t0, 0.0) + + execution_results: List[str] = [] + if isinstance(result, tuple) and result: + payload = result[0] + if isinstance(payload, list): + execution_results = list(payload) + + per_call_seconds = ( + elapsed / len(func_call_list) if func_call_list else 0.0 + ) + + handler_obj = get_extended_telemetry_handler() + for index, func_call in enumerate(func_call_list): + tool_name = _extract_tool_name(func_call) + arguments = _extract_tool_arguments(func_call) + execution_result = ( + execution_results[index] + if index < len(execution_results) + else None + ) + + tool_inv = ExecuteToolInvocation( + tool_name=tool_name, + tool_call_id=_synth_tool_call_id( + test_entry_id, model_name, index + ), + tool_type="function", + tool_call_arguments=arguments, + tool_call_result=execution_result, + ) + + try: + with handler_obj.execute_tool(tool_inv) as inv: + span = inv.span + if span is not None and span.is_recording(): + span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME) + span.set_attribute(BFCL_TOOL_INDEX, index) + span.set_attribute( + BFCL_TOOL_DURATION_IS_ESTIMATED, True + ) + if test_entry_id is not None: + span.set_attribute( + BFCL_TEST_ENTRY_ID, str(test_entry_id) 
+ ) + if isinstance(execution_result, str) and execution_result.startswith( + "Error during execution:" + ): + try: + from opentelemetry.trace import ( + Status, + StatusCode, + ) + + span.set_status( + Status( + StatusCode.ERROR, + execution_result[:200], + ) + ) + except Exception: # noqa: BLE001 + pass + # Approximate latency by sleeping the budgeted slice + # would distort BFCL execution; we instead rely on + # span start/end (currently both wall-clock-now). + # The ``bfcl.tool.duration_is_estimated`` attribute + # signals the limitation to consumers. + _ = per_call_seconds # unused but documented + # Bump a per-AGENT counter for downstream debugging. + next_tool_index() + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 TOOL: span emission failed for %s", + tool_name, + exc_info=True, + ) + + return result + + +def _extract_tool_name(func_call: Any) -> str: + if not isinstance(func_call, str) or "(" not in func_call: + return "unknown" + head = func_call.split("(", 1)[0] + # ``head`` may be ``module.method`` or ``instance.method`` - keep the + # last segment which is the actual callable. 
+ return head.split(".")[-1] or "unknown" + + +def _extract_tool_arguments(func_call: Any) -> Optional[str]: + if not isinstance(func_call, str): + return None + if "(" not in func_call or not func_call.endswith(")"): + return func_call + args_part = func_call[func_call.index("(") + 1 : -1] + return args_part if args_part else None + + +def _synth_tool_call_id( + test_entry_id: Optional[Any], model_name: Optional[Any], index: int +) -> str: + parts = [ + str(test_entry_id) if test_entry_id is not None else "no_id", + str(model_name) if model_name is not None else "no_model", + str(index), + ] + return "-".join(parts) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py new file mode 100644 index 000000000..66e9fa6e1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py @@ -0,0 +1,17 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +_instruments = ("bfcl-eval >= 4.0.0",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py new file mode 100644 index 000000000..c63bbc62b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py @@ -0,0 +1,144 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for the BFCL v4 instrumentation. + +The :class:`GenAIHookHelper` mirrors the helper used by the LoongSuite CrewAI +instrumentation: it gates ``gen_ai.input.messages`` / +``gen_ai.output.messages`` / ``gen_ai.system_instructions`` on the standard +LoongSuite content-capture environment knobs so that prompt content is not +exported by default. 
+""" + +from __future__ import annotations + +import dataclasses +import logging +from typing import Any, Dict, List, Optional + +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import Span +from opentelemetry.util.genai.types import ( + ContentCapturingMode, + InputMessage, + MessagePart, + OutputMessage, + Text, +) +from opentelemetry.util.genai.utils import ( + gen_ai_json_dumps, + get_content_capturing_mode, + is_experimental_mode, +) + +logger = logging.getLogger(__name__) + + +class GenAIHookHelper: + """Conditionally write prompt / completion content to the span.""" + + def __init__(self, capture_content: bool = True) -> None: + self.capture_content = capture_content + + def on_completion( + self, + span: Span, + inputs: Optional[List[InputMessage]] = None, + outputs: Optional[List[OutputMessage]] = None, + system_instructions: Optional[List[MessagePart]] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> None: + if not span.is_recording(): + return + + if self.capture_content and is_experimental_mode(): + mode = get_content_capturing_mode() + should_capture_span = mode in ( + ContentCapturingMode.SPAN_ONLY, + ContentCapturingMode.SPAN_AND_EVENT, + ) + + if should_capture_span: + if inputs: + span.set_attribute( + gen_ai_attributes.GEN_AI_INPUT_MESSAGES, + gen_ai_json_dumps( + [dataclasses.asdict(i) for i in inputs] + ), + ) + if outputs: + span.set_attribute( + gen_ai_attributes.GEN_AI_OUTPUT_MESSAGES, + gen_ai_json_dumps( + [dataclasses.asdict(o) for o in outputs] + ), + ) + if system_instructions: + span.set_attribute( + gen_ai_attributes.GEN_AI_SYSTEM_INSTRUCTIONS, + gen_ai_json_dumps( + [dataclasses.asdict(s) for s in system_instructions] + ), + ) + + if attributes: + for key, value in attributes.items(): + if value is None: + continue + try: + span.set_attribute(key, value) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4: failed to set attribute %s", key, exc_info=True + ) + + 
def to_text_input(role: str, content: Any) -> List[InputMessage]:
    """Wrap *content* into a single-part ``InputMessage`` list.

    Empty-ish content (``None``, ``""``, ``[]``, ``{}``) yields an empty
    list so callers can assign the result unconditionally.
    """
    if content in (None, "", [], {}):
        return []
    text = content if isinstance(content, str) else _to_safe_str(content)
    return [InputMessage(role=role, parts=[Text(content=text)])]


def to_text_output(
    role: str, content: Any, finish_reason: str = "stop"
) -> List[OutputMessage]:
    """Wrap *content* into a single-part ``OutputMessage`` list.

    ``finish_reason`` defaults to ``"stop"``; empty-ish content yields an
    empty list, mirroring :func:`to_text_input`.
    """
    if content in (None, "", [], {}):
        return []
    text = content if isinstance(content, str) else _to_safe_str(content)
    return [
        OutputMessage(
            role=role, parts=[Text(content=text)], finish_reason=finish_reason
        )
    ]


def _to_safe_str(value: Any) -> str:
    """Best-effort JSON serialisation, falling back to ``str()``.

    The wrapper code never wants a serialisation failure to break a span.
    """
    try:
        return gen_ai_json_dumps(value)
    except Exception:  # noqa: BLE001
        try:
            return str(value)
        except Exception:  # noqa: BLE001
            return ""


def truncate_text(value: str, limit: int = 4096) -> str:
    """Clip *value* to *limit* characters, appending ``"..."`` when clipped.

    Note the clipped result is ``limit + 3`` characters long: the ellipsis
    is appended on top of the budget rather than counted against it.
    """
    if len(value) <= limit:
        return value
    # Plain string literal: the previous ``f"..."`` was an f-string with
    # no placeholders (ruff F541) and compiled to the same constant.
    return value[:limit] + "..."
+# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.1.0.dev" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py new file mode 100644 index 000000000..41446ee3b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py @@ -0,0 +1,52 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Smoke tests for ``BFCLv4Instrumentor``. + +These tests do not require ``bfcl-eval`` to be installed; they only verify +that importing the package and calling ``instrument()`` / ``uninstrument()`` +works (and degrades gracefully when ``bfcl-eval`` is missing). 
+""" + +import importlib + +import pytest + + +def test_import_instrumentor_package(): + module = importlib.import_module("opentelemetry.instrumentation.bfclv4") + assert hasattr(module, "BFCLv4Instrumentor") + + +def test_instrumentation_dependencies_listed(): + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + from opentelemetry.instrumentation.bfclv4.package import _instruments + + instr = BFCLv4Instrumentor() + assert tuple(instr.instrumentation_dependencies()) == _instruments + + +def test_instrument_uninstrument_no_bfcl_no_raise(): + """When ``bfcl-eval`` is missing, every wrap call logs and continues. + + The instrumentor must not raise from ``instrument()`` / + ``uninstrument()`` even if the target framework cannot be imported. + """ + + pytest.importorskip("opentelemetry.util.genai.extended_handler") + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + + instr = BFCLv4Instrumentor() + instr.instrument() + instr.uninstrument() diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py new file mode 100644 index 000000000..21bbf6348 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py @@ -0,0 +1,113 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for the framework-agnostic helpers.""" + +import contextvars + +import pytest + + +def test_state_lifecycle(): + from opentelemetry.instrumentation.bfclv4.internal.state import ( + bump_round, + bump_turn, + get_state, + init_state, + next_tool_index, + reset_state, + ) + + token = init_state() + try: + state = get_state() + assert state == {"turn_idx": 0, "fc_round": 0, "tool_index": 0} + + assert bump_round() == 1 + assert bump_round() == 2 + assert bump_turn() == 1 + # bump_turn resets fc_round + state = get_state() + assert state["turn_idx"] == 1 + assert state["fc_round"] == 0 + assert next_tool_index() == 0 + assert next_tool_index() == 1 + finally: + reset_state(token) + + # After reset the state should be gone (None default). + assert get_state() is None + + +def test_context_propagating_executor_carries_contextvars(): + from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import ( + ContextPropagatingExecutor, + ) + + cv: contextvars.ContextVar[str] = contextvars.ContextVar( + "bfclv4_test_cv", default="default" + ) + cv.set("from_main_thread") + + def _read(): + return cv.get() + + with ContextPropagatingExecutor(max_workers=2) as pool: + future = pool.submit(_read) + assert future.result() == "from_main_thread" + + +def test_extract_tool_name_and_arguments(): + from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + _extract_tool_arguments, + _extract_tool_name, + ) + + assert _extract_tool_name("calc.add(1, 2)") == "add" + assert _extract_tool_name("list_files()") == "list_files" + assert _extract_tool_name("not a call") == "unknown" + assert _extract_tool_arguments("foo(a=1, b=2)") == "a=1, b=2" + assert _extract_tool_arguments("foo()") is None + + +def test_infer_finish_reason_heuristic(): + from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + _infer_finish_reason, + ) + + assert _infer_finish_reason([]) == "empty_response" + assert _infer_finish_reason([[]]) == "empty_response" + 
assert _infer_finish_reason([{"name": "x"}]) == "tool_calls" + assert _infer_finish_reason("plain string") == "stop" + assert _infer_finish_reason(None) == "unknown" + + +def test_provider_mapping_without_bfcl(monkeypatch): + from opentelemetry.instrumentation.bfclv4.internal.provider import ( + infer_provider, + ) + + pytest.importorskip( + "opentelemetry.util.genai.extended_types", + ) + + class _Dummy: + model_style = None + + name, extras = infer_provider(_Dummy()) + # If bfcl-eval is not installed, ``ModelStyle`` import fails and we get + # ``unknown``; otherwise we still get ``unknown`` because ``model_style`` + # is None. + assert name == "unknown" + assert extras == {} From 3d08e03d0a8dd1c3e0566964b985a69c76908460 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 6 May 2026 20:05:03 +0800 Subject: [PATCH 2/3] feat: support bfclv4 --- .../instrumentation/bfclv4/__init__.py | 91 ++++++++++++------- .../bfclv4/internal/wrappers.py | 40 ++------ .../instrumentation/bfclv4/version.py | 2 +- 3 files changed, 67 insertions(+), 66 deletions(-) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py index 34a5a9b10..6a7729940 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py @@ -108,6 +108,8 @@ def __init__(self) -> None: self._inference_wrapped = False if not hasattr(self, "_tool_wrapped"): self._tool_wrapped = False + if not hasattr(self, "_tool_targets"): + self._tool_targets: List[Tuple[str, str]] = [] def instrumentation_dependencies(self) -> Collection[str]: return _instruments @@ -121,9 +123,9 @@ def _instrument(self, **kwargs: Any) -> None: # noqa: D401 # 1) ENTRY 
----------------------------------------------------- try: wrap_function_wrapper( - module=_GENERATE_RESULTS_MODULE, - name=_GENERATE_RESULTS_NAME, - wrapper=GenerateResultsWrapper(helper), + _GENERATE_RESULTS_MODULE, + _GENERATE_RESULTS_NAME, + GenerateResultsWrapper(helper), ) self._entry_wrapped = True except Exception as exc: # noqa: BLE001 @@ -137,9 +139,9 @@ def _instrument(self, **kwargs: Any) -> None: # noqa: D401 # 2) AGENT ----------------------------------------------------- try: wrap_function_wrapper( - module=_BASE_HANDLER_MODULE, - name=_BASE_HANDLER_NAME, - wrapper=BaseHandlerInferenceWrapper(helper), + _BASE_HANDLER_MODULE, + _BASE_HANDLER_NAME, + BaseHandlerInferenceWrapper(helper), ) self._inference_wrapped = True except Exception as exc: # noqa: BLE001 @@ -154,20 +156,39 @@ def _instrument(self, **kwargs: Any) -> None: # noqa: D401 self._instrument_handlers(helper) # 5) TOOL ------------------------------------------------------ - try: - wrap_function_wrapper( - module=_EXECUTE_TOOL_MODULE, - name=_EXECUTE_TOOL_NAME, - wrapper=ExecuteFuncCallWrapper(helper), - ) - self._tool_wrapped = True - except Exception as exc: # noqa: BLE001 - logger.warning( - "bfclv4: failed to wrap %s.%s: %s", - _EXECUTE_TOOL_MODULE, + # ``execute_multi_turn_func_call`` is re-exported via ``from ... import`` + # in several BFCL modules, so wrapping just the source module misses + # the call sites that use the local binding. We wrap each known + # re-export site as well to guarantee the TOOL span is always emitted. 
+ tool_targets = [ + (_EXECUTE_TOOL_MODULE, _EXECUTE_TOOL_NAME), + ( + "bfcl_eval.model_handler.base_handler", _EXECUTE_TOOL_NAME, - exc, - ) + ), + ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker", + _EXECUTE_TOOL_NAME, + ), + ] + wrapper_instance = ExecuteFuncCallWrapper(helper) + self._tool_targets = [] + for module_name, attr_name in tool_targets: + try: + wrap_function_wrapper( + module_name, + attr_name, + wrapper_instance, + ) + self._tool_targets.append((module_name, attr_name)) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to wrap %s.%s: %s", + module_name, + attr_name, + exc, + ) + self._tool_wrapped = bool(self._tool_targets) def _instrument_handlers(self, helper: GenAIHookHelper) -> None: # Reflectively wrap every concrete ``_query_FC`` / ``_query_prompting`` @@ -199,9 +220,9 @@ def _instrument_handlers(self, helper: GenAIHookHelper) -> None: seen_func_ids.add(key) try: wrap_function_wrapper( - module=cls.__module__, - name=f"{cls.__name__}.{method_name}", - wrapper=QueryWrapper(helper, mode), + cls.__module__, + f"{cls.__name__}.{method_name}", + QueryWrapper(helper, mode), ) self._wrapped_query_methods.append((cls, method_name)) except Exception as exc: # noqa: BLE001 @@ -223,9 +244,9 @@ def _instrument_handlers(self, helper: GenAIHookHelper) -> None: seen_func_ids.add(key) try: wrap_function_wrapper( - module=cls.__module__, - name=f"{cls.__name__}.{method_name}", - wrapper=TurnBumpWrapper(reset=is_first), + cls.__module__, + f"{cls.__name__}.{method_name}", + TurnBumpWrapper(reset=is_first), ) self._wrapped_turn_methods.append((cls, method_name)) except Exception as exc: # noqa: BLE001 @@ -242,14 +263,18 @@ def _instrument_handlers(self, helper: GenAIHookHelper) -> None: def _uninstrument(self, **kwargs: Any) -> None: # noqa: D401 if self._tool_wrapped: - try: - module = importlib.import_module(_EXECUTE_TOOL_MODULE) - unwrap(module, _EXECUTE_TOOL_NAME) - except Exception as exc: # noqa: BLE001 - 
logger.debug( - "bfclv4: failed to unwrap execute_multi_turn_func_call: %s", - exc, - ) + for module_name, attr_name in getattr(self, "_tool_targets", []): + try: + module = importlib.import_module(module_name) + unwrap(module, attr_name) + except Exception as exc: # noqa: BLE001 + logger.debug( + "bfclv4: failed to unwrap %s.%s: %s", + module_name, + attr_name, + exc, + ) + self._tool_targets = [] self._tool_wrapped = False for cls, method_name in self._wrapped_query_methods: diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py index 9683cb85b..31106b9e4 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py @@ -435,40 +435,16 @@ def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D40 # Post-call attribute enrichment - use try/except so that any # vendor-side parsing surprise never breaks BFCL itself. + # + # IMPORTANT: We must NOT re-call ``_parse_query_response_*`` here, + # because for streaming providers (e.g. Qwen DashScope) the + # ``api_response`` is a single-pass generator that the parser + # consumes; calling it twice leaves BFCL's own subsequent call to + # the parser with an exhausted iterator, which crashes inference + # with ``UnboundLocalError: chunk``. Token usage will instead be + # recovered later from the AGENT-level metadata payload. 
try: if span is not None and span.is_recording(): - parser_name = ( - "_parse_query_response_FC" - if self._mode == "FC" - else "_parse_query_response_prompting" - ) - parser = getattr(instance, parser_name, None) - if parser is not None: - parsed = parser(api_response) - if isinstance(parsed, dict): - input_token = parsed.get("input_token") - output_token = parsed.get("output_token") - if isinstance(input_token, (int, float)): - span.set_attribute( - "gen_ai.usage.input_tokens", - int(input_token), - ) - if isinstance(output_token, (int, float)): - span.set_attribute( - "gen_ai.usage.output_tokens", - int(output_token), - ) - if isinstance(input_token, (int, float)) and isinstance( - output_token, (int, float) - ): - span.set_attribute( - "gen_ai.usage.total_tokens", - int(input_token) + int(output_token), - ) - model_resp = parsed.get("model_responses") - step_inv.finish_reason = _infer_finish_reason( - model_resp - ) if isinstance(query_latency, (int, float)): try: span.set_attribute( diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py index 4effd145c..3263662eb 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.1.0.dev" +__version__ = "0.1.3.dev0" From 5cbd0490d441bb3ff08dc1172ecdd822aeb285bc Mon Sep 17 00:00:00 2001 From: musi Date: Wed, 6 May 2026 22:08:27 +0800 Subject: [PATCH 3/3] fix(bfclv4): keep STEP/LLM context detach in LIFO order for streaming responses When ``_query_FC`` / ``_query_prompting`` returns a streaming wrapper (e.g. 
``openai-v2`` ``ChatStreamWrapper``), the LLM span and its OTel context attach are kept alive until the stream is consumed by BFCL's ``_parse_query_response_*`` after the STEP context manager has already exited. Non-LIFO context detach then leaves the prior LLM span as the "current" span, which causes subsequent STEP and TOOL spans to be parented under the previous STEP rather than under AGENT. Force-consume the streaming response inside the STEP context and replace it with a plain iterator over the cached chunks so that ``stop_llm`` (which detaches LLM context) runs in LIFO order before STEP detaches. --- .../bfclv4/internal/wrappers.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py index 31106b9e4..cb9bd3f34 100644 --- a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py @@ -433,6 +433,33 @@ def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D40 # inference: ..." result string at the AGENT layer. raise + # When the underlying handler returns a streaming wrapper + # (e.g. ``ChatStreamWrapper`` from openai-v2), the LLM span and + # its OTel context attach are kept alive until the stream is + # consumed by BFCL's ``_parse_query_response_*`` *outside* of + # this STEP context manager. That breaks the LIFO ordering of + # context attach/detach, leaving the LLM span as the "current" + # span after the STEP CM exits, which causes the next STEP and + # any TOOL spans to be parented to the previous STEP rather + # than to the AGENT. 
+ # + # To preserve LIFO ordering, force-consume the stream here + # (inside the STEP context) and replace it with a plain + # iterator over the cached chunks. This makes ``stop_llm`` + # (which detaches the LLM context) run *before* STEP detaches. + if api_response is not None and hasattr( + api_response, "__next__" + ) and not isinstance(api_response, (str, bytes)): + try: + chunks = list(api_response) + api_response = iter(chunks) + except Exception: # noqa: BLE001 + logger.debug( + "bfclv4 STEP: failed to materialise streaming " + "response; LLM/STEP nesting may be incorrect", + exc_info=True, + ) + # Post-call attribute enrichment - use try/except so that any # vendor-side parsing surprise never breaks BFCL itself. #