alibaba · 123liuziming · May 7, 2026
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md
@@ -0,0 +1,55 @@
+# LoongSuite WildToolBench Instrumentation
+
+OpenTelemetry instrumentation for the [WildToolBench](https://github.com/yupeijei1997/WildToolBench) benchmark framework.
+
+## Installation
+
+WildToolBench is not available on PyPI. Install it from source first, then install this instrumentation package:
+
+```bash
+pip install -e /path/to/WildToolBench/wild-tool-bench
+pip install loongsuite-instrumentation-wildtool
+```
+
+## Requirements
+
+- **OpenAI provider instrumentation**: To produce LLM spans, you must also enable an OpenAI provider instrumentation (e.g., `opentelemetry-instrumentation-openai-v2` or LoongSuite's equivalent). This plugin creates ENTRY/AGENT/CHAIN/STEP/TOOL spans but does **not** create LLM spans itself.
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+
+WildToolInstrumentor().instrument()
+
+# Run WildToolBench as usual — spans are automatically generated.
+```
+
+## Span Topology
+
+```
+ENTRY (enter_ai_application_system)
+└── AGENT (invoke_agent wildtool)
+    └── CHAIN (workflow task_{idx})
+        └── STEP (react step)
+            ├── [LLM span — provider instrumentation]
+            └── TOOL (execute_tool {tool_name})
+```
+
+## Patch Points
+
+| # | Target | Span Type |
+|---|--------|-----------|
+| P1 | `multi_threaded_inference` | ENTRY |
+| P2 | `BaseHandler.inference_multi_turn` | AGENT |
+| P3 | `BaseHandler.inference_and_eval_multi_step` | CHAIN + TOOL |
+| P4 | `BaseHandler._request_tool_call` | STEP |
+| P5 | `BaseHandler._parse_api_response` | (token extraction) |
+
+## Round 2 fixes (see `llm-dev/execute.md` § "修订记录 (Round 2 fix)")
+
+- **H1**: TOOL span is now parented on STEP, not CHAIN. This uses an enhanced version of Strategy A: the chain wrapper holds a `round → STEP span` map and uses `trace.set_span_in_context(step_span)` to anchor each post-hoc TOOL span on the matching STEP. A STEP span's `SpanContext` remains a valid parent even after the span has been ended with `end()`.
+- **H2 (provider-name fallback)**: `opentelemetry-instrumentation-openai-v2 == 0.62b1` only emits the legacy `gen_ai.system` attribute on its LLM span; the new `gen_ai.provider.name` attribute is missing. As a *pure fallback* the wildtool plugin writes both `gen_ai.system="openai"` and `gen_ai.provider.name="openai"` on the **STEP** span (not on the LLM span — that is owned by the OpenAI v2 instrumentation and we do **not** patch it). Once the OpenAI v2 instrumentation upstream emits `gen_ai.provider.name` natively this fallback can be removed.
+- **M1**: CHAIN span now carries `input.value` (last user message in `inference_data["messages"]`, truncated to 4096 chars) and `output.value` (JSON of `action_name_label`/`task_idx`/`is_optimal`).
+- **M2**: STEP span now carries `gen_ai.react.finish_reason` on error paths. Mapping table is in `execute.md` § "M2: gen_ai.react.finish_reason 取值映射".
+- **M3**: TOOL span explicitly writes `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` / `gen_ai.tool.description`, bypassing `OTEL_INSTRUMENTATION_GENAI_CAPTURE_*` gating in `opentelemetry-util-genai`. The custom `wildtool.tool.execution_mode = "ground_truth_replay"` is preserved.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml
@@ -0,0 +1,66 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-wildtool"
+dynamic = ["version"]
+description = "LoongSuite WildToolBench Instrumentation"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.9"
+authors = [
+    { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" },
+    { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+    "opentelemetry-api ~= 1.37",
+    "opentelemetry-instrumentation >= 0.58b0",
+    "opentelemetry-semantic-conventions >= 0.58b0",
+    "opentelemetry-util-genai",
+    "wrapt >= 1.17.3, < 3.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+    "openai >= 1.0.0",
+]
+
+test = [
+    "pytest ~= 8.0",
+    "pytest-cov ~= 4.1.0",
+    "pytest-forked >= 1.6.0",
+    "opentelemetry-sdk >= 1.37",
+    "openai >= 1.0.0",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+wildtool = "opentelemetry.instrumentation.wildtool:WildToolInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-wildtool"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/wildtool/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "/src",
+    "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/...oongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py b/...oongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py
@@ -0,0 +1,161 @@
+"""OpenTelemetry WildToolBench Instrumentation"""
+
+import logging
+from typing import Any, Collection
+
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.utils import unwrap
+from opentelemetry.instrumentation.wildtool.package import _instruments
+from opentelemetry.instrumentation.wildtool.version import __version__
+from opentelemetry.instrumentation.wildtool._wrappers import (
+    WildToolAgentWrapper,
+    WildToolChainWrapper,
+    WildToolEntryWrapper,
+    WildToolParseWrapper,
+    WildToolRequestWrapper,
+)
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+
+logger = logging.getLogger(__name__)  # module-level logger used for instrument/uninstrument diagnostics
+
+_LLM_RESPONSE_GEN_MODULE = "wtb._llm_response_generation"  # module whose `multi_threaded_inference` is wrapped (P1)
+_BASE_HANDLER_MODULE = "wtb.model_handler.base_handler"  # module whose `BaseHandler` methods are wrapped (P2/P3)
+
+__all__ = ["WildToolInstrumentor", "__version__"]  # names exported by `from ... import *`
+
+
+class WildToolInstrumentor(BaseInstrumentor):
+    """OpenTelemetry instrumentor for WildToolBench framework."""
+
+    def __init__(self):
+        super().__init__()
+        self._handler = None
+        # Track concrete handler subclasses whose abstract _request_tool_call /
+        # _parse_api_response we have already wrapped, so we can unwrap on
+        # uninstrument and avoid double-wrapping.
+        self._patched_handler_classes: set = set()
+        self._request_wrapper = None
+        self._parse_wrapper = None
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        tracer_provider = kwargs.get("tracer_provider")
+        meter_provider = kwargs.get("meter_provider")
+        logger_provider = kwargs.get("logger_provider")
+
+        self._handler = ExtendedTelemetryHandler(
+            tracer_provider=tracer_provider,
+            meter_provider=meter_provider,
+            logger_provider=logger_provider,
+        )
+        self._request_wrapper = WildToolRequestWrapper(self._handler)
+        self._parse_wrapper = WildToolParseWrapper(self._handler)
+
+        # P1: ENTRY span
+        try:
+            wrap_function_wrapper(
+                _LLM_RESPONSE_GEN_MODULE,
+                "multi_threaded_inference",
+                WildToolEntryWrapper(self._handler),
+            )
+        except Exception as e:
+            logger.warning("Failed to instrument multi_threaded_inference: %s", e)
+
+        # P2: AGENT span
+        try:
+            wrap_function_wrapper(
+                _BASE_HANDLER_MODULE,
+                "BaseHandler.inference_multi_turn",
+                WildToolAgentWrapper(self._handler),
+            )
+        except Exception as e:
+            logger.warning("Failed to instrument inference_multi_turn: %s", e)
+
+        # P3: CHAIN span (+ STEP + TOOL management).
+        # The chain wrapper also lazily patches the concrete subclass'
+        # `_request_tool_call` / `_parse_api_response` on first use, so that
+        # subclasses overriding the abstract base methods are still
+        # intercepted (P4 / P5).
+        try:
+            wrap_function_wrapper(
+                _BASE_HANDLER_MODULE,
+                "BaseHandler.inference_and_eval_multi_step",
+                WildToolChainWrapper(self._handler, self),
+            )
+        except Exception as e:
+            logger.warning(
+                "Failed to instrument inference_and_eval_multi_step: %s", e
+            )
+
+    def ensure_handler_class_patched(self, handler_cls) -> None:
+        """Lazily wrap the concrete handler subclass' P4/P5 methods.
+
+        WildToolBench declares ``_request_tool_call`` and ``_parse_api_response``
+        as abstract on ``BaseHandler``, but real handlers (and tests) override
+        them. Python method resolution dispatches directly to the override and
+        therefore never reaches a wrapper installed on the base class. We
+        instead wrap the override on first invocation per subclass.
+        """
+        if handler_cls in self._patched_handler_classes:
+            return
+        self._patched_handler_classes.add(handler_cls)
+
+        module_name = handler_cls.__module__
+        cls_name = handler_cls.__name__
+        for method, wrapper in (
+            ("_request_tool_call", self._request_wrapper),
+            ("_parse_api_response", self._parse_wrapper),
+        ):
+            if method not in handler_cls.__dict__:
+                continue
+            try:
+                wrap_function_wrapper(
+                    module_name,
+                    f"{cls_name}.{method}",
+                    wrapper,
+                )
+            except Exception as e:
+                logger.debug(
+                    "Failed to wrap %s.%s.%s: %s",
+                    module_name,
+                    cls_name,
+                    method,
+                    e,
+                )
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        try:
+            import wtb._llm_response_generation as llm_gen
+
+            unwrap(llm_gen, "multi_threaded_inference")
+        except Exception as e:
+            logger.debug("Failed to uninstrument multi_threaded_inference: %s", e)
+
+        try:
+            import wtb.model_handler.base_handler as bh
+
+            unwrap(bh.BaseHandler, "inference_multi_turn")
+            unwrap(bh.BaseHandler, "inference_and_eval_multi_step")
+        except Exception as e:
+            logger.debug("Failed to uninstrument BaseHandler methods: %s", e)
+
+        for cls in list(self._patched_handler_classes):
+            for method in ("_request_tool_call", "_parse_api_response"):
+                if method in cls.__dict__:
+                    try:
+                        unwrap(cls, method)
+                    except Exception as e:
+                        logger.debug(
+                            "Failed to unwrap %s.%s: %s",
+                            cls.__name__,
+                            method,
+                            e,
+                        )
+        self._patched_handler_classes.clear()
+        self._request_wrapper = None
+        self._parse_wrapper = None
+        self._handler = None