From b7e7a4b0bcfc507c94858c4f17fc717eee6437ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=89=A7=E6=80=9D?= Date: Thu, 7 May 2026 09:32:45 +0800 Subject: [PATCH] feat: support wild-tool Change-Id: I0da98161cbdbe6a51b963bcc19f45a3d2d977968 --- .../README.md | 55 ++ .../pyproject.toml | 66 ++ .../instrumentation/wildtool/__init__.py | 161 +++++ .../instrumentation/wildtool/_wrappers.py | 644 ++++++++++++++++++ .../instrumentation/wildtool/package.py | 2 + .../instrumentation/wildtool/utils.py | 17 + .../instrumentation/wildtool/version.py | 1 + .../tests/__init__.py | 0 .../tests/conftest.py | 182 +++++ .../tests/test_agent_span.py | 108 +++ .../tests/test_chain_step_tool_spans.py | 283 ++++++++ .../tests/test_entry_span.py | 115 ++++ .../tests/test_error_scenarios.py | 135 ++++ .../tests/test_instrumentor.py | 20 + .../tests/test_round2_fixes.py | 441 ++++++++++++ 15 files changed, 2230 insertions(+) create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md new file mode 100644 index 000000000..1b0499fa4 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md @@ -0,0 +1,55 @@ +# LoongSuite WildToolBench Instrumentation + +OpenTelemetry instrumentation for the [WildToolBench](https://github.com/yupeijei1997/WildToolBench) benchmark framework. + +## Installation + +WildToolBench is not available on PyPI. Install it from source: + +```bash +pip install -e /path/to/WildToolBench/wild-tool-bench +pip install loongsuite-instrumentation-wildtool +``` + +## Requirements + +- **OpenAI provider instrumentation**: To produce LLM spans, you must also enable an OpenAI provider instrumentation (e.g., `opentelemetry-instrumentation-openai` or LoongSuite's equivalent). 
This plugin creates ENTRY/AGENT/CHAIN/STEP/TOOL spans but does **not** create LLM spans itself.
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+
+WildToolInstrumentor().instrument()
+
+# Run WildToolBench as usual; spans are generated automatically.
+```
+
+## Span Topology
+
+```
+ENTRY (enter_ai_application_system)
+└── AGENT (invoke_agent wildtool)
+    └── CHAIN (workflow task_{idx})
+        └── STEP (react step)
+            ├── [LLM span, from provider instrumentation]
+            └── TOOL (execute_tool {tool_name})
+```
+
+## Patch Points
+
+| # | Target | Span Type |
+|---|--------|-----------|
+| P1 | `multi_threaded_inference` | ENTRY |
+| P2 | `BaseHandler.inference_multi_turn` | AGENT |
+| P3 | `BaseHandler.inference_and_eval_multi_step` | CHAIN + TOOL |
+| P4 | `BaseHandler._request_tool_call` | STEP |
+| P5 | `BaseHandler._parse_api_response` | (token extraction) |
+
+## Round 2 fixes (see `llm-dev/execute.md` § "修订记录 (Round 2 fix)", i.e. the revision log)
+
+- **H1**: TOOL spans are now parented on STEP, not CHAIN. Strategy A was enhanced: the chain wrapper holds a `round → STEP span` map and uses `trace.set_span_in_context(step_span)` to anchor each post-hoc TOOL span on the matching STEP. STEP `SpanContext`s remain valid parents even after `end()`.
+- **H2 (provider-name fallback)**: `opentelemetry-instrumentation-openai-v2 == 0.62b1` only emits the legacy `gen_ai.system` attribute on its LLM span; the new `gen_ai.provider.name` attribute is missing. As a *pure fallback*, the wildtool plugin writes both `gen_ai.system="openai"` and `gen_ai.provider.name="openai"` on the **STEP** span (not on the LLM span, which is owned by the OpenAI v2 instrumentation and which we do **not** patch). Once the upstream OpenAI v2 instrumentation emits `gen_ai.provider.name` natively, this fallback can be removed.
+- **M1**: The CHAIN span now carries `input.value` (the last user message in `inference_data["messages"]`, truncated to 4096 chars) and `output.value` (a JSON summary of `action_name_label`/`task_idx`/`is_optimal`).
+- **M2**: The STEP span now carries `gen_ai.react.finish_reason` on error paths. The mapping table is in `execute.md` § "M2: gen_ai.react.finish_reason 取值映射" (value mapping).
+- **M3**: TOOL spans explicitly write `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` / `gen_ai.tool.description`, bypassing the `OTEL_INSTRUMENTATION_GENAI_CAPTURE_*` gating in `opentelemetry-util-genai`. The custom `wildtool.tool.execution_mode = "ground_truth_replay"` attribute is preserved.
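+
+## Example: combined setup
+
+A minimal setup sketch, not part of the benchmark itself. It assumes the `opentelemetry-instrumentation-openai-v2` package mentioned above for the LLM spans and uses a console exporter for local inspection; swap in your own provider instrumentation and exporter as needed.
+
+```python
+from opentelemetry import trace
+from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (
+    BatchSpanProcessor,
+    ConsoleSpanExporter,
+)
+
+provider = TracerProvider()
+provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+trace.set_tracer_provider(provider)
+
+# The provider instrumentation emits the LLM spans; wildtool emits the
+# ENTRY/AGENT/CHAIN/STEP/TOOL spans around them.
+OpenAIInstrumentor().instrument(tracer_provider=provider)
+WildToolInstrumentor().instrument(tracer_provider=provider)
+```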
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml new file mode 100644 index 000000000..b8f9f44d0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml @@ -0,0 +1,66 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-wildtool" +dynamic = ["version"] +description = "LoongSuite WildToolBench Instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9" +authors = [ + { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api ~= 1.37", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "opentelemetry-util-genai", + "wrapt >= 1.17.3, < 3.0.0", +] + +[project.optional-dependencies] +instruments = [ + "openai >= 1.0.0", +] + +test = [ + "pytest ~= 8.0", + "pytest-cov ~= 4.1.0", + "pytest-forked >= 1.6.0", + "opentelemetry-sdk >= 1.37", + "openai >= 1.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +wildtool = "opentelemetry.instrumentation.wildtool:WildToolInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-wildtool" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/wildtool/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py new file mode 100644 index 000000000..dad772500 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py @@ -0,0 +1,161 @@ +"""OpenTelemetry WildToolBench Instrumentation""" + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.instrumentation.wildtool.package import _instruments +from opentelemetry.instrumentation.wildtool.version import __version__ +from opentelemetry.instrumentation.wildtool._wrappers import ( + WildToolAgentWrapper, + WildToolChainWrapper, + WildToolEntryWrapper, + WildToolParseWrapper, + WildToolRequestWrapper, +) +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler + +logger = logging.getLogger(__name__) + +_LLM_RESPONSE_GEN_MODULE = "wtb._llm_response_generation" +_BASE_HANDLER_MODULE = 
"wtb.model_handler.base_handler" + +__all__ = ["WildToolInstrumentor", "__version__"] + + +class WildToolInstrumentor(BaseInstrumentor): + """OpenTelemetry instrumentor for WildToolBench framework.""" + + def __init__(self): + super().__init__() + self._handler = None + # Track concrete handler subclasses whose abstract _request_tool_call / + # _parse_api_response we have already wrapped, so we can unwrap on + # uninstrument and avoid double-wrapping. + self._patched_handler_classes: set = set() + self._request_wrapper = None + self._parse_wrapper = None + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + meter_provider = kwargs.get("meter_provider") + logger_provider = kwargs.get("logger_provider") + + self._handler = ExtendedTelemetryHandler( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + logger_provider=logger_provider, + ) + self._request_wrapper = WildToolRequestWrapper(self._handler) + self._parse_wrapper = WildToolParseWrapper(self._handler) + + # P1: ENTRY span + try: + wrap_function_wrapper( + _LLM_RESPONSE_GEN_MODULE, + "multi_threaded_inference", + WildToolEntryWrapper(self._handler), + ) + except Exception as e: + logger.warning("Failed to instrument multi_threaded_inference: %s", e) + + # P2: AGENT span + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + "BaseHandler.inference_multi_turn", + WildToolAgentWrapper(self._handler), + ) + except Exception as e: + logger.warning("Failed to instrument inference_multi_turn: %s", e) + + # P3: CHAIN span (+ STEP + TOOL management). + # The chain wrapper also lazily patches the concrete subclass' + # `_request_tool_call` / `_parse_api_response` on first use, so that + # subclasses overriding the abstract base methods are still + # intercepted (P4 / P5). + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + "BaseHandler.inference_and_eval_multi_step", + WildToolChainWrapper(self._handler, self), + ) + except Exception as e: + logger.warning( + "Failed to instrument inference_and_eval_multi_step: %s", e + ) + + def ensure_handler_class_patched(self, handler_cls) -> None: + """Lazily wrap the concrete handler subclass' P4/P5 methods. + + WildToolBench declares ``_request_tool_call`` and ``_parse_api_response`` + as abstract on ``BaseHandler``, but real handlers (and tests) override + them. Python method resolution dispatches directly to the override and + therefore never reaches a wrapper installed on the base class. We + instead wrap the override on first invocation per subclass. 
+ """ + if handler_cls in self._patched_handler_classes: + return + self._patched_handler_classes.add(handler_cls) + + module_name = handler_cls.__module__ + cls_name = handler_cls.__name__ + for method, wrapper in ( + ("_request_tool_call", self._request_wrapper), + ("_parse_api_response", self._parse_wrapper), + ): + if method not in handler_cls.__dict__: + continue + try: + wrap_function_wrapper( + module_name, + f"{cls_name}.{method}", + wrapper, + ) + except Exception as e: + logger.debug( + "Failed to wrap %s.%s.%s: %s", + module_name, + cls_name, + method, + e, + ) + + def _uninstrument(self, **kwargs: Any) -> None: + try: + import wtb._llm_response_generation as llm_gen + + unwrap(llm_gen, "multi_threaded_inference") + except Exception as e: + logger.debug("Failed to uninstrument multi_threaded_inference: %s", e) + + try: + import wtb.model_handler.base_handler as bh + + unwrap(bh.BaseHandler, "inference_multi_turn") + unwrap(bh.BaseHandler, "inference_and_eval_multi_step") + except Exception as e: + logger.debug("Failed to uninstrument BaseHandler methods: %s", e) + + for cls in list(self._patched_handler_classes): + for method in ("_request_tool_call", "_parse_api_response"): + if method in cls.__dict__: + try: + unwrap(cls, method) + except Exception as e: + logger.debug( + "Failed to unwrap %s.%s: %s", + cls.__name__, + method, + e, + ) + self._patched_handler_classes.clear() + self._request_wrapper = None + self._parse_wrapper = None + self._handler = None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py new file mode 100644 index 000000000..612a332ab --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py @@ -0,0 +1,644 @@ +"""Wrapper classes for WildToolBench instrumentation. + +Each wrapper corresponds to one patch point and manages the lifecycle +of one or more span types. + +Round 2 fix highlights (see ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)"): + +H1 + TOOL span parent is now STEP rather than CHAIN. Each STEP invocation is + appended to a per-chain list in :data:`_chain_step_invocations`; when the + chain wrapper post-processes ``inference_log`` it looks up the matching + STEP span by ``round`` and uses + :func:`opentelemetry.trace.set_span_in_context` so ``start_execute_tool`` + parents the TOOL span on the STEP context (even if STEP is already + closed — its :class:`SpanContext` remains a valid parent reference). + +H2 + The OpenAI v2 provider instrumentation (0.62b1) writes only the legacy + ``gen_ai.system`` attribute to its LLM span. The wildtool plugin now + writes both ``gen_ai.system`` and ``gen_ai.provider.name`` on the STEP + span as a fallback so the new semantic-conventions attribute is present + in the trace tree even before the upstream OpenAI v2 instrumentation + catches up. We do **not** patch the OpenAI v2 instrumentation itself. + +M1 + ``input.value`` (last user message in the chain's ``messages``, truncated + to 4096 chars) and ``output.value`` (a JSON of action label, task index + and is_optimal) are written on the CHAIN span. + +M2 + ``gen_ai.react.finish_reason`` is derived from ``inference_log`` on the + *last* (currently active) STEP. Mappings: + + ``"parse_tool_calls_failed"`` + ``error_reason`` contains "parse tool_calls failed". 
+ ``"action_name_mismatch"`` + ``error_reason`` contains "action name not in candidate". + ``"empty_response"`` + ``error_reason`` contains "tool_calls and content are None". + ``"error"`` + request raised an exception (handled in + :class:`WildToolRequestWrapper`). + +M3 + ``gen_ai.tool.call.arguments``, ``gen_ai.tool.call.result`` and + ``gen_ai.tool.description`` are written explicitly on TOOL spans + *before* close as a fallback. ``opentelemetry-util-genai`` gates these + sensitive attributes behind ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` env + vars; the wildtool plugin always writes them since wtb data is + benchmark-synthetic and never PII. +""" + +import json +import logging +from contextvars import ContextVar +from typing import List, Optional + +from opentelemetry.trace import StatusCode, set_span_in_context +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) +from opentelemetry.util.genai.types import Error + +logger = logging.getLogger(__name__) + +# ─────────────────────────── ContextVars ─────────────────────────────── +# The CHAIN wrapper opens a new logical "chain" by flipping ``_in_chain`` +# and resetting the counter. The REQUEST wrapper reads these to decide +# whether to create a STEP span and what round number to assign. +_in_chain: ContextVar[bool] = ContextVar("_wt_in_chain", default=False) + +# Currently open STEP invocation. Used by the parse wrapper to attach +# token attributes to the right span. +_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar( + "_wt_step_inv", default=None +) +_step_counter: ContextVar[int] = ContextVar("_wt_step_ctr", default=0) + +# Per-chain list of every STEP invocation created in the current chain +# (in `round` order). The chain wrapper allocates this list on entry and +# uses it after ``wrapped`` returns to re-parent TOOL spans onto the +# matching STEP. Even if a STEP span is already ``end()``-ed, its +# :class:`SpanContext` stays valid as a parent reference for new spans. +_chain_step_invocations: ContextVar[Optional[List[ReactStepInvocation]]] = ( + ContextVar("_wt_chain_step_invs", default=None) +) + +_PROVIDER_FALLBACK_NAME = "openai" +_INPUT_VALUE_MAX_CHARS = 4096 + + +def _close_active_step(handler: ExtendedTelemetryHandler) -> None: + """Close the currently active STEP span, if any.""" + prev = _step_invocation.get() + if prev is not None: + try: + handler.stop_react_step(prev) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to close step: %s", e) + _step_invocation.set(None) + + +def _truncate(text: str, max_chars: int) -> str: + if len(text) <= max_chars: + return text + return text[:max_chars] + "...(truncated)" + + +def _stringify(value) -> str: + if isinstance(value, str): + return value + try: + return json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return str(value) + + +class WildToolEntryWrapper: + """P1: Wraps multi_threaded_inference → ENTRY span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + # Signature: multi_threaded_inference(handler, model_name, test_case). + # We only need model_name and test_case for ENTRY attributes; the + # handler instance flows through as args[0] untouched. 
+ model_name = args[1] if len(args) > 1 else kwargs.get("model_name", "") + test_case = args[2] if len(args) > 2 else kwargs.get("test_case", {}) + + invocation = EntryInvocation( + session_id=test_case.get("id"), + attributes={ + "gen_ai.framework": "wildtool", + "gen_ai.request.model": model_name, + "wildtool.turn_count": len(test_case.get("english_tasks", [])), + }, + ) + self._handler.start_entry(invocation) + try: + result = wrapped(*args, **kwargs) + self._handler.stop_entry(invocation) + return result + except Exception as e: + self._handler.fail_entry( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +class WildToolAgentWrapper: + """P2: Wraps BaseHandler.inference_multi_turn → AGENT span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + test_entry = args[0] if args else kwargs.get("test_entry", {}) + + invocation = InvokeAgentInvocation( + provider=None, + agent_name=type(instance).__name__, + conversation_id=test_entry.get("id"), + request_model=getattr(instance, "model_name", None), + attributes={ + "gen_ai.framework": "wildtool", + "wildtool.turn_count": len( + test_entry.get("english_answer_list", []) + ), + }, + ) + self._handler.start_invoke_agent(invocation) + try: + result = wrapped(*args, **kwargs) + total_input = 0 + total_output = 0 + for task_result in (result or []): + if isinstance(task_result, dict): + total_input += sum( + task_result.get("input_token_count", []) + ) + total_output += sum( + task_result.get("output_token_count", []) + ) + if total_input: + invocation.input_tokens = total_input + if total_output: + invocation.output_tokens = total_output + self._handler.stop_invoke_agent(invocation) + return result + except Exception as e: + self._handler.fail_invoke_agent( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +class WildToolChainWrapper: + """P3: Wraps BaseHandler.inference_and_eval_multi_step → CHAIN span. + + Also manages the lifecycle of the final STEP span and creates TOOL spans + from the returned ``inference_log`` after the original function completes. + Round 2 fixes (H1/M1/M2/M3) are implemented here. + """ + + def __init__(self, handler: ExtendedTelemetryHandler, instrumentor=None): + self._handler = handler + self._instrumentor = instrumentor + + def __call__(self, wrapped, instance, args, kwargs): + if self._instrumentor is not None and instance is not None: + try: + self._instrumentor.ensure_handler_class_patched(type(instance)) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to ensure subclass patched: %s", e) + + inference_data = args[0] if args else kwargs.get("inference_data", {}) + if not isinstance(inference_data, dict): + inference_data = {} + task_idx = inference_data.get("task_idx", 0) + test_entry_id = inference_data.get("test_entry_id", "") + + span_name = f"workflow task_{task_idx}" + tracer = self._handler._tracer + + chain_token = _in_chain.set(True) + counter_token = _step_counter.set(0) + step_token = _step_invocation.set(None) + chain_steps: List[ReactStepInvocation] = [] + chain_steps_token = _chain_step_invocations.set(chain_steps) + + chain_attributes = { + "gen_ai.span.kind": "CHAIN", + "gen_ai.operation.name": "workflow", + "gen_ai.framework": "wildtool", + "wildtool.task_idx": task_idx, + "wildtool.test_entry_id": test_entry_id, + } + + # M1: Capture last user message as ``input.value`` BEFORE running the + # wrapped function (the wtb function mutates ``messages`` in place). 
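+        # Illustrative shape of the list _extract_input_value scans (wtb
+        # chat format; only the last "user" entry is captured):
+        #   [{"role": "user", "content": "What is the weather in Beijing?"},
+        #    {"role": "assistant", "tool_calls": [...]},
+        #    {"role": "tool", "tool_call_id": "call_001", "content": "..."}]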
+ input_value = self._extract_input_value(inference_data) + if input_value is not None: + chain_attributes["input.value"] = input_value + + with tracer.start_as_current_span( + name=span_name, attributes=chain_attributes + ) as span: + try: + result = wrapped(*args, **kwargs) + + # M2: Set finish_reason on the currently active (last) STEP + # BEFORE we close it. Only the terminal step ever carries an + # error finish_reason (every wtb error path triggers `break`). + if isinstance(result, dict): + self._apply_last_step_finish_reason( + result.get("inference_log", {}) + ) + + _close_active_step(self._handler) + + if isinstance(result, dict): + label = result.get("action_name_label", "") + is_optimal = bool(result.get("is_optimal", False)) + span.set_attribute("wildtool.action_name_label", label) + span.set_attribute("wildtool.is_optimal", is_optimal) + + # M1: ``output.value`` summarising chain outcome. + try: + span.set_attribute( + "output.value", + json.dumps( + { + "action_name_label": label, + "task_idx": task_idx, + "is_optimal": is_optimal, + }, + ensure_ascii=False, + ), + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set output.value: %s", e) + + # H1 + M3: re-parent TOOL spans on STEP and force-write + # tool call sensitive attributes. + self._create_tool_spans_from_log( + result.get("inference_log", {}), + inference_data, + chain_steps, + ) + + span.set_status(StatusCode.OK) + return result + except Exception as e: + _close_active_step(self._handler) + span.record_exception(e) + span.set_status(StatusCode.ERROR) + raise + finally: + _chain_step_invocations.reset(chain_steps_token) + _step_counter.reset(counter_token) + _step_invocation.reset(step_token) + _in_chain.reset(chain_token) + + # -- M1 --------------------------------------------------------------- + + @staticmethod + def _extract_input_value(inference_data) -> Optional[str]: + msgs = inference_data.get("messages") if isinstance( + inference_data, dict + ) else None + if not isinstance(msgs, list): + return None + for m in reversed(msgs): + if not isinstance(m, dict) or m.get("role") != "user": + continue + content = m.get("content") + if content is None: + continue + text = _stringify(content) + return _truncate(text, _INPUT_VALUE_MAX_CHARS) + return None + + # -- M2 --------------------------------------------------------------- + + def _apply_last_step_finish_reason(self, inference_log) -> None: + if not isinstance(inference_log, dict): + return + current_step = _step_invocation.get() + if current_step is None or current_step.round is None: + return + step_key = f"step_{current_step.round - 1}" + step_data = inference_log.get(step_key) + if not isinstance(step_data, dict): + return + output = step_data.get("inference_output") or {} + if not isinstance(output, dict): + return + label = output.get("current_action_name_label") + error_reason = output.get("error_reason") or "" + reason = self._derive_step_finish_reason(label, error_reason) + if reason is None: + return + # Setting `invocation.finish_reason` is enough — the util-genai + # `_apply_react_step_finish_attributes` writes + # ``gen_ai.react.finish_reason`` from this field on stop. 
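+        # e.g. label == "error" with error_reason containing
+        # "parse tool_calls failed" yields "parse_tool_calls_failed"
+        # (full mapping in the module docstring under M2).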
+ current_step.finish_reason = reason + + @staticmethod + def _derive_step_finish_reason( + label, error_reason: str + ) -> Optional[str]: + """Map wtb inference_log error_reason → gen_ai.react.finish_reason.""" + if label != "error": + return None + if "parse tool_calls failed" in error_reason: + return "parse_tool_calls_failed" + if "action name not in candidate" in error_reason: + return "action_name_mismatch" + if "tool_calls and content are None" in error_reason: + return "empty_response" + return "error" + + # -- H1 + M3 ---------------------------------------------------------- + + def _create_tool_spans_from_log( + self, + inference_log, + inference_data, + chain_steps: List[ReactStepInvocation], + ) -> None: + """Post-hoc TOOL span creation from inference_log. + + Uses the per-chain STEP invocation list to parent each TOOL span on + the matching STEP span (H1). Sensitive tool-call attributes are + written explicitly on the span (M3) so they appear regardless of + ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` settings. + """ + if not isinstance(inference_log, dict): + return + + # round → SpanContext-bearing OTel context for parenting + step_ctx_by_round = {} + for step_inv in chain_steps: + if step_inv.round is None or step_inv.span is None: + continue + try: + step_ctx_by_round[step_inv.round] = set_span_in_context( + step_inv.span + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to compute step parent context: %s", e) + + # tool name → description (for gen_ai.tool.description) + tool_desc_map = {} + tools = inference_data.get("tools") if isinstance( + inference_data, dict + ) else None + if isinstance(tools, list): + for tool in tools: + if not isinstance(tool, dict): + continue + func = tool.get("function") or tool + if not isinstance(func, dict): + continue + name = func.get("name") + desc = func.get("description") + if name: + tool_desc_map[name] = desc + + # Extract tool observations from final messages keyed by tool_call_id; + # wtb only embeds them in messages (not in inference_answer) for the + # tool_call branch. 
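+        # Illustrative message this indexes (shape mirrored by the test
+        # fixtures):
+        #   {"role": "tool", "tool_call_id": "call_001",
+        #    "content": "Sunny, 25°C"}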
+        observation_by_call_id = {}
+        messages = inference_data.get("messages") if isinstance(
+            inference_data, dict
+        ) else None
+        if isinstance(messages, list):
+            for msg in messages:
+                if not isinstance(msg, dict) or msg.get("role") != "tool":
+                    continue
+                tid = msg.get("tool_call_id")
+                if tid is None:
+                    continue
+                content = msg.get("content")
+                if content is None:
+                    continue
+                observation_by_call_id[tid] = (
+                    content if isinstance(content, str) else _stringify(content)
+                )
+
+        # Iterate steps in numeric order; a plain lexicographic sort would
+        # put "step_10" before "step_2".
+        step_keys = []
+        for key in inference_log:
+            if not key.startswith("step_"):
+                continue
+            try:
+                step_keys.append((int(key[len("step_"):]), key))
+            except ValueError:
+                continue
+
+        for step_idx, key in sorted(step_keys):
+            round_num = step_idx + 1
+
+            step_data = inference_log[key]
+            if not isinstance(step_data, dict):
+                continue
+            output = step_data.get("inference_output") or {}
+            if not isinstance(output, dict):
+                continue
+            tool_calls = output.get("tool_calls")
+            label = output.get("current_action_name_label")
+            if not tool_calls or label != "correct":
+                continue
+
+            answer_data = step_data.get("inference_answer") or {}
+            candidate = (
+                answer_data.get("candidate_0_answer_function_list")
+                if isinstance(answer_data, dict)
+                else None
+            ) or {}
+            candidate_observation = (
+                candidate.get("observation")
+                if isinstance(candidate, dict)
+                else None
+            )
+
+            parent_ctx = step_ctx_by_round.get(round_num)
+
+            for tc in tool_calls:
+                if not isinstance(tc, dict):
+                    continue
+                func = tc.get("function") or {}
+                if not isinstance(func, dict):
+                    func = {}
+                tool_name = func.get("name", "unknown")
+                tool_id = tc.get("id")
+                tool_args_raw = func.get("arguments", "")
+                tool_args_str = (
+                    tool_args_raw
+                    if isinstance(tool_args_raw, str)
+                    else _stringify(tool_args_raw)
+                )
+
+                observation_str: Optional[str] = None
+                if tool_id is not None and tool_id in observation_by_call_id:
+                    observation_str = observation_by_call_id[tool_id]
+                elif candidate_observation is not None:
+                    observation_str = (
+                        candidate_observation
+                        if isinstance(candidate_observation, str)
+                        else _stringify(candidate_observation)
+                    )
+
+                description = tool_desc_map.get(tool_name)
+
+                invocation = ExecuteToolInvocation(
+                    tool_name=tool_name,
+                    tool_call_id=tool_id,
+                    tool_call_arguments=tool_args_str,
+                    tool_call_result=observation_str,
+                    tool_type="function",
+                    tool_description=description,
+                    attributes={
+                        "wildtool.tool.execution_mode": "ground_truth_replay",
+                    },
+                )
+
+                try:
+                    self._handler.start_execute_tool(
+                        invocation, context=parent_ctx
+                    )
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to start_execute_tool: %s", e)
+                    continue
+
+                # M3: explicitly write tool_call sensitive attrs. The
+                # util-genai `_get_tool_call_data_attributes` helper guards
+                # these behind experimental-mode + content-capture-mode env
+                # vars which are not always set in real deployments.
+                tool_span = invocation.span
+                if tool_span is not None and tool_span.is_recording():
+                    try:
+                        tool_span.set_attribute(
+                            "gen_ai.tool.call.arguments", tool_args_str
+                        )
+                        if observation_str is not None:
+                            tool_span.set_attribute(
+                                "gen_ai.tool.call.result", observation_str
+                            )
+                        if description:
+                            tool_span.set_attribute(
+                                "gen_ai.tool.description", description
+                            )
+                    except Exception as e:  # noqa: BLE001
+                        logger.debug("Failed to set tool span attrs: %s", e)
+
+                try:
+                    self._handler.stop_execute_tool(invocation)
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to stop_execute_tool: %s", e)
+
+
+class WildToolRequestWrapper:
+    """P4: Wraps BaseHandler._request_tool_call.
+ + Creates STEP span (ReactStepInvocation) before each LLM call. + Extracts latency from return value. Also writes the H2 provider-name + fallback attributes (``gen_ai.system`` + ``gen_ai.provider.name``) on + the STEP span so the new semconv attribute is present in the trace + even when the upstream OpenAI v2 instrumentation only emits the legacy + ``gen_ai.system``. + """ + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + if not _in_chain.get(): + return wrapped(*args, **kwargs) + + # Close the previous step (the natural end-of-step is when the next + # request fires). The STEP span's SpanContext stays valid as a + # parent for TOOL spans created later. + _close_active_step(self._handler) + + step_num = _step_counter.get() + 1 + _step_counter.set(step_num) + + step_inv = ReactStepInvocation(round=step_num) + try: + self._handler.start_react_step(step_inv) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to start react step: %s", e) + return wrapped(*args, **kwargs) + + # H2: provider-name fallback attributes. Written on the STEP, not + # on the LLM span, because the LLM span is owned by the OpenAI v2 + # provider instrumentation and is created lazily inside the wtb + # request implementation. + if step_inv.span is not None and step_inv.span.is_recording(): + try: + step_inv.span.set_attribute( + "gen_ai.system", _PROVIDER_FALLBACK_NAME + ) + step_inv.span.set_attribute( + "gen_ai.provider.name", _PROVIDER_FALLBACK_NAME + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set provider fallback attrs: %s", e) + + # Track this step for H1 TOOL re-parenting. + chain_steps = _chain_step_invocations.get() + if chain_steps is not None: + chain_steps.append(step_inv) + _step_invocation.set(step_inv) + + try: + result = wrapped(*args, **kwargs) + if isinstance(result, tuple) and len(result) == 2: + _, latency = result + if step_inv.span and step_inv.span.is_recording(): + try: + step_inv.span.set_attribute( + "wildtool.latency", float(latency) + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set wildtool.latency: %s", e) + return result + except Exception as e: + step_inv.finish_reason = "error" + self._handler.fail_react_step( + step_inv, Error(message=str(e), type=type(e)) + ) + _step_invocation.set(None) + raise + + +class WildToolParseWrapper: + """P5: Wraps BaseHandler._parse_api_response. + + Extracts token counts from parsed response and sets them on the + current STEP span as attributes. 
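+
+    The parsed dict is expected to expose ``input_token`` /
+    ``output_token`` keys, as the handler implementations in the tests do.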
+ """ + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + result = wrapped(*args, **kwargs) + + step_inv = _step_invocation.get() + if step_inv and step_inv.span and step_inv.span.is_recording(): + if isinstance(result, dict): + input_t = result.get("input_token") + output_t = result.get("output_token") + if input_t is not None: + step_inv.span.set_attribute( + "gen_ai.usage.input_tokens", input_t + ) + if output_t is not None: + step_inv.span.set_attribute( + "gen_ai.usage.output_tokens", output_t + ) + + return result diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py new file mode 100644 index 000000000..1ac5bcfee --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py @@ -0,0 +1,2 @@ +_instruments = ("openai >= 1.0.0",) +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py new file mode 100644 index 000000000..c26b7711d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py @@ -0,0 +1,17 @@ +"""Utility functions for WildToolBench instrumentation.""" + +import json +from typing import Any, Optional + + +def safe_json_dumps(obj: Any, max_length: int = 10000) -> Optional[str]: + """Safely serialize object to JSON string with length limit.""" + if obj is None: + return None + try: + s = json.dumps(obj, ensure_ascii=False) + if len(s) > max_length: + return s[:max_length] + "...(truncated)" + return s + except (TypeError, ValueError): + return str(obj)[:max_length] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py new file mode 100644 index 000000000..014186185 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py @@ -0,0 +1,182 @@ +"""Test configuration for WildToolBench instrumentation tests.""" + +import json +import os + +import pytest + +os.environ.setdefault("OPENAI_API_KEY", "test_key_not_real") +os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:9999/v1") + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from 
opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + + +def pytest_configure(config: pytest.Config): + os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental" + + +@pytest.fixture(scope="function", name="span_exporter") +def fixture_span_exporter(): + exporter = InMemorySpanExporter() + yield exporter + + +@pytest.fixture(scope="function", name="tracer_provider") +def fixture_tracer_provider(span_exporter): + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function") +def instrument(tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() + + +# ==================== Minimal test data fixtures ==================== + + +def _make_chat_completion_response( + content=None, + tool_calls=None, + input_tokens=10, + output_tokens=5, + model="gpt-4o", +): + """Build a minimal ChatCompletion-like dict that can be JSON-serialized.""" + message = {"role": "assistant", "content": content or ""} + if tool_calls: + message["tool_calls"] = tool_calls + return { + "id": "chatcmpl-test", + "object": "chat.completion", + "model": model, + "choices": [{"index": 0, "message": message, "finish_reason": "stop"}], + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + }, + } + + +class FakeChatCompletion: + """Mimics openai.types.chat.ChatCompletion enough for _parse_api_response.""" + + def __init__(self, data: dict): + self._data = data + + def json(self): + return json.dumps(self._data) + + def __getattr__(self, name): + return self._data[name] + + +@pytest.fixture() +def make_completion(): + """Factory fixture to build FakeChatCompletion objects.""" + + def _factory(**kwargs): + return FakeChatCompletion(_make_chat_completion_response(**kwargs)) + + return _factory + + +@pytest.fixture() +def simple_test_entry(): + """A minimal WildToolBench test_entry with 1 task, 1 step (prepare_to_answer).""" + return { + "id": "wild_tool_bench_test_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + }, + "required": ["city"], + }, + }, + } + ], + "english_tasks": ["What is the weather in Beijing?"], + "english_answer_list": [ + [ + { + "action": { + "name": "get_weather", + "arguments": {"city": "Beijing"}, + }, + "observation": "Sunny, 25°C", + "dependency_list": [], + }, + { + "action": { + "name": "prepare_to_answer", + "arguments": {}, + }, + "observation": "The weather in Beijing is Sunny, 25°C", + "dependency_list": [0], + }, + ] + ], + } + + +@pytest.fixture() +def tool_call_response_factory(): + """Factory to make tool_call ChatCompletion responses.""" + + def _factory(tool_name, arguments, tool_call_id="call_001"): + tc = [ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": ( + json.dumps(arguments) + if isinstance(arguments, dict) + else arguments + ), + }, + } + ] + return FakeChatCompletion( + _make_chat_completion_response(tool_calls=tc) + ) + + return _factory + + +@pytest.fixture() +def text_response_factory(): + """Factory to make text-only ChatCompletion responses.""" + + def 
_factory(content, input_tokens=10, output_tokens=5): + return FakeChatCompletion( + _make_chat_completion_response( + content=content, + input_tokens=input_tokens, + output_tokens=output_tokens, + ) + ) + + return _factory diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py new file mode 100644 index 000000000..2929eeb33 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py @@ -0,0 +1,108 @@ +"""Tests for AGENT span (P2: inference_multi_turn).""" + +import json +from unittest.mock import patch + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing AGENT span.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.1 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestAgentSpan: + def test_agent_span_attributes( + self, span_exporter, instrument, simple_test_entry, make_completion, + tool_call_response_factory, text_response_factory, + ): + """AGENT span should exist with correct attributes and token aggregation.""" + handler = _StubHandler() + + # Step 0: model returns tool call for get_weather + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + # Step 1: model returns text (prepare_to_answer match) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=20, output_tokens=15, + ) + handler._step_responses = [resp0, resp1] + + result = handler.inference_multi_turn(simple_test_entry) + assert result is not None + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + assert len(agent_spans) == 1 + + span = agent_spans[0] + assert span.name == "invoke_agent _StubHandler" + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "AGENT" + assert attrs.get("gen_ai.operation.name") == "invoke_agent" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.agent.name") == "_StubHandler" + assert attrs.get("gen_ai.conversation.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "test-model" + assert attrs.get("wildtool.turn_count") == 1 + + assert attrs.get("gen_ai.usage.input_tokens") == 30 + assert attrs.get("gen_ai.usage.output_tokens") == 20 + + def test_agent_parent_is_entry( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """When called via multi_threaded_inference, AGENT span should be child of ENTRY.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + 
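+        # Queue one tool-call turn and one plain-text turn; the stub pops
+        # them in order from _request_tool_call.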
handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + + entry = entry_spans[0] + agent = agent_spans[0] + assert agent.context.trace_id == entry.context.trace_id + assert agent.parent is not None + assert agent.parent.span_id == entry.context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py new file mode 100644 index 000000000..d7dd7b4aa --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py @@ -0,0 +1,283 @@ +"""Tests for CHAIN / STEP / TOOL spans (P3, P4, P5).""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass with controllable responses.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestChainSpan: + def test_chain_span_per_task( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each task should produce one CHAIN span with correct attributes.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + + chain = chain_spans[0] + assert chain.name == "workflow task_0" + attrs = dict(chain.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "CHAIN" + assert attrs.get("gen_ai.operation.name") == "workflow" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("wildtool.task_idx") == 0 + assert attrs.get("wildtool.test_entry_id") == "wild_tool_bench_test_001" + assert attrs.get("wildtool.action_name_label") == "correct" + assert attrs.get("wildtool.is_optimal") is True + + def test_chain_parent_is_agent( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """CHAIN span should be child of AGENT span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + 
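+        # Runs the full wtb loop; the instrumentation wraps it with
+        # AGENT → CHAIN → STEP spans.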
handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + chain_spans = [s for s in spans if s.name.startswith("workflow")] + + assert len(agent_spans) == 1 + assert len(chain_spans) == 1 + + agent = agent_spans[0] + chain = chain_spans[0] + assert chain.context.trace_id == agent.context.trace_id + assert chain.parent is not None + assert chain.parent.span_id == agent.context.span_id + + +class TestStepSpans: + def test_step_spans_per_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each _request_tool_call invocation should produce a STEP span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 2 + + attrs0 = dict(step_spans[0].attributes or {}) + attrs1 = dict(step_spans[1].attributes or {}) + rounds = sorted([attrs0.get("gen_ai.react.round"), attrs1.get("gen_ai.react.round")]) + assert rounds == [1, 2] + + for ss in step_spans: + a = dict(ss.attributes or {}) + assert a.get("gen_ai.span.kind") == "STEP" + assert a.get("gen_ai.operation.name") == "react" + + def test_step_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP spans should be children of CHAIN span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(chain_spans) == 1 + chain = chain_spans[0] + + for ss in step_spans: + assert ss.context.trace_id == chain.context.trace_id + assert ss.parent is not None + assert ss.parent.span_id == chain.context.span_id + + def test_step_token_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP span should have gen_ai.usage.input_tokens and output_tokens.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=25, output_tokens=12, + ) + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = sorted( + [s for s in spans if s.name == "react step"], + key=lambda s: s.attributes.get("gen_ai.react.round", 0), + ) + assert len(step_spans) == 2 + + # First step: default 10 input, 5 output from make_completion defaults + a0 = dict(step_spans[0].attributes or {}) + assert a0.get("gen_ai.usage.input_tokens") == 10 + assert a0.get("gen_ai.usage.output_tokens") == 5 + + # Second step: 25 input, 12 output + a1 = dict(step_spans[1].attributes or {}) + assert a1.get("gen_ai.usage.input_tokens") == 25 + assert 
a1.get("gen_ai.usage.output_tokens") == 12 + + +class TestToolSpans: + def test_tool_span_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL span should have correct attributes including execution_mode.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + + tool = tool_spans[0] + assert tool.name == "execute_tool get_weather" + attrs = dict(tool.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "TOOL" + assert attrs.get("gen_ai.operation.name") == "execute_tool" + assert attrs.get("gen_ai.tool.name") == "get_weather" + assert attrs.get("gen_ai.tool.type") == "function" + assert ( + attrs.get("wildtool.tool.execution_mode") == "ground_truth_replay" + ) + + def test_tool_span_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL spans share the CHAIN trace_id (parent is STEP after Round 2).""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + tool_spans = [s for s in spans if "execute_tool" in s.name] + + assert len(chain_spans) == 1 + assert len(tool_spans) >= 1 + + chain = chain_spans[0] + for ts in tool_spans: + assert ts.context.trace_id == chain.context.trace_id + + +class TestSpanHierarchy: + def test_full_hierarchy( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Verify ENTRY → AGENT → CHAIN → STEP hierarchy and consistent trace_id.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + + entry = [s for s in spans if s.name == "enter_ai_application_system"] + agent = [s for s in spans if "invoke_agent" in s.name] + chain = [s for s in spans if s.name.startswith("workflow")] + step = [s for s in spans if s.name == "react step"] + tool = [s for s in spans if "execute_tool" in s.name] + + assert len(entry) == 1 + assert len(agent) == 1 + assert len(chain) == 1 + assert len(step) == 2 + assert len(tool) >= 1 + + trace_id = entry[0].context.trace_id + for s in spans: + assert s.context.trace_id == trace_id + + # AGENT parent = ENTRY + assert agent[0].parent.span_id == entry[0].context.span_id + # CHAIN parent = AGENT + assert chain[0].parent.span_id == agent[0].context.span_id + # STEP parent = CHAIN + for s in step: + assert s.parent.span_id == chain[0].context.span_id diff --git 
a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py new file mode 100644 index 000000000..834e7dd13 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py @@ -0,0 +1,115 @@ +"""Tests for ENTRY span (P1: multi_threaded_inference). + +Module-level imports of ``wtb._llm_response_generation.multi_threaded_inference`` +must be avoided: ``wrapt.wrap_function_wrapper`` patches the attribute on the +module, but a pre-imported local binding still references the original +unwrapped function. All tests therefore import the symbol lazily after the +``instrument`` fixture has run. +""" + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing. + + Overrides ``inference`` so the multi_threaded_inference wrapper invokes a + deterministic, side-effect-free body that returns a fake result dict and + therefore exercises only the ENTRY span codepath. + """ + + def __init__(self): + super().__init__("test-model", 0.0) + + def _request_tool_call(self, inference_data): + raise NotImplementedError + + def _parse_api_response(self, api_response): + raise NotImplementedError + + def inference(self, test_entry): + return [ + { + "action_name_label": "correct", + "is_optimal": True, + "inference_log": {}, + "latency": [0.1], + "input_token_count": [10], + "output_token_count": [5], + } + ] + + +class TestEntrySpan: + def test_entry_span_created(self, span_exporter, instrument): + """ENTRY span should be created with correct attributes.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + test_case = { + "id": "wild_tool_bench_test_001", + "english_tasks": ["task1", "task2"], + } + + result = multi_threaded_inference(handler, "gpt-4o", test_case) + + assert result is not None + assert result["id"] == "wild_tool_bench_test_001" + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + + span = entry_spans[0] + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "ENTRY" + assert attrs.get("gen_ai.operation.name") == "enter" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.session.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "gpt-4o" + assert attrs.get("wildtool.turn_count") == 2 + # ENTRY spans rely on default OTel status semantics: success leaves + # the span UNSET, failures explicitly mark it ERROR. + assert span.status.status_code != StatusCode.ERROR + + def test_entry_span_error_path(self, span_exporter, instrument): + """The ENTRY wrapper marks the span ERROR when the wrapped callable + raises an unhandled exception. + + ``multi_threaded_inference`` swallows non-rate-limit errors itself + (see test_error_scenarios.test_entry_span_captures_retry_error_path + for that path). To exercise the wrapper's failure branch directly we + invoke the underlying ``WildToolEntryWrapper`` with a callable that + deliberately raises, bypassing ``multi_threaded_inference``'s own + error handling. 
+        """
+        from opentelemetry.instrumentation.wildtool._wrappers import (
+            WildToolEntryWrapper,
+        )
+
+        wrapper = WildToolEntryWrapper(instrument._handler)
+
+        def _raising(handler, model_name, test_case):
+            raise RuntimeError("API connection failed")
+
+        handler = _StubHandler()
+        test_case = {
+            "id": "wild_tool_bench_test_002",
+            "english_tasks": ["task1"],
+        }
+
+        with pytest.raises(RuntimeError, match="API connection failed"):
+            wrapper(_raising, None, (handler, "gpt-4o", test_case), {})
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        assert len(entry_spans) == 1
+        span = entry_spans[0]
+        assert span.status.status_code == StatusCode.ERROR
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py
new file mode 100644
index 000000000..c14a3f40c
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py
@@ -0,0 +1,135 @@
+"""Tests for error/edge-case scenarios."""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """Handler with controllable step responses."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_responses = []
+        self._step_idx = 0
+
+    def _request_tool_call(self, inference_data):
+        resp = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        if isinstance(resp, Exception):
+            raise resp
+        return resp, 0.05
+
+    def _parse_api_response(self, api_response):
+        data = json.loads(api_response.json())
+        choice = data["choices"][0]
+        message = choice["message"]
+        return {
+            "reasoning_content": None,
+            "content": message.get("content"),
+            "tool_calls": message.get("tool_calls"),
+            "input_token": data["usage"]["prompt_tokens"],
+            "output_token": data["usage"]["completion_tokens"],
+        }
+
+
+class TestErrorScenarios:
+    def test_action_name_mismatch(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory,
+    ):
+        """When the model calls the wrong tool, the CHAIN span should still be OK with an error label."""
+        handler = _StubHandler()
+        # Model calls wrong_tool instead of get_weather
+        resp0 = tool_call_response_factory(
+            "wrong_tool", {"x": 1}, "call_bad"
+        )
+        handler._step_responses = [resp0]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        assert len(chain_spans) == 1
+
+        chain = chain_spans[0]
+        attrs = dict(chain.attributes or {})
+        assert attrs.get("wildtool.action_name_label") == "error"
+        assert chain.status.status_code == StatusCode.OK
+
+    def test_empty_response(
+        self, span_exporter, instrument, simple_test_entry,
+        make_completion,
+    ):
+        """When the model returns no content and no tool_calls, the process terminates gracefully."""
+        from tests.conftest import FakeChatCompletion, _make_chat_completion_response
+
+        handler = _StubHandler()
+        resp = FakeChatCompletion(
+            _make_chat_completion_response(content="", tool_calls=None)
+        )
+        handler._step_responses = [resp]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        assert len(chain_spans) == 1
+        attrs = dict(chain_spans[0].attributes or {})
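+        # An empty assistant reply matches no ground-truth action, so the
+        # chain ends with action_name_label "error" instead of raising.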
attrs.get("wildtool.action_name_label") == "error" + + def test_request_tool_call_exception_sets_error( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call should produce ERROR on STEP span and propagate.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Connection timeout")] + + with pytest.raises(RuntimeError, match="Connection timeout"): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 1 + assert step_spans[0].status.status_code == StatusCode.ERROR + + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + assert chain_spans[0].status.status_code == StatusCode.ERROR + + def test_entry_span_captures_retry_error_path( + self, span_exporter, instrument, + ): + """multi_threaded_inference catches non-rate-limit errors and returns error dict. + ENTRY span should still complete successfully (not raise).""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + + def failing_inference(test_entry): + raise ValueError("Invalid JSON from model") + + handler.inference = failing_inference + + test_case = { + "id": "wild_tool_bench_err_001", + "english_tasks": ["task1"], + } + + # multi_threaded_inference catches non-rate-limit errors + result = multi_threaded_inference(handler, "test-model", test_case) + assert "Error during inference" in result["result"] + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + # multi_threaded_inference's own try/except converts the error into a + # normal return, so the ENTRY wrapper observes a successful call and + # leaves the span at the default UNSET status (definitely not ERROR). + span = entry_spans[0] + assert span.status.status_code != StatusCode.ERROR diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py new file mode 100644 index 000000000..a8be5b4da --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py @@ -0,0 +1,20 @@ +"""Tests for WildToolInstrumentor lifecycle.""" + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor + + +class TestWildToolInstrumentor: + def test_instrument_and_uninstrument(self, tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_instrumentation_dependencies(self): + instrumentor = WildToolInstrumentor() + deps = instrumentor.instrumentation_dependencies() + assert ("openai >= 1.0.0",) == deps diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py new file mode 100644 index 000000000..9f4f4d895 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py @@ -0,0 +1,441 @@ +"""Round 2 regression tests covering the H1 / H2 / M1 / M2 / M3 fixes. 
+
+See ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)" and
+``example-deploy/validation/SUMMARY.md`` for the original validation gaps
+addressed by these tests.
+"""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """Minimal handler with controllable LLM responses (no real network)."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_responses = []
+        self._step_idx = 0
+
+    def _request_tool_call(self, inference_data):
+        resp = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        if isinstance(resp, Exception):
+            raise resp
+        return resp, 0.05
+
+    def _parse_api_response(self, api_response):
+        data = json.loads(api_response.json())
+        choice = data["choices"][0]
+        message = choice["message"]
+        return {
+            "reasoning_content": None,
+            "content": message.get("content"),
+            "tool_calls": message.get("tool_calls"),
+            "input_token": data["usage"]["prompt_tokens"],
+            "output_token": data["usage"]["completion_tokens"],
+        }
+
+
+def _spans_by_kind(spans, kind):
+    return [s for s in spans if (s.attributes or {}).get("gen_ai.span.kind") == kind]
+
+
+def _spans_named(spans, name):
+    return [s for s in spans if s.name == name]
+
+
+def _step_for_round(spans, round_num):
+    for s in _spans_named(spans, "react step"):
+        attrs = s.attributes or {}
+        if attrs.get("gen_ai.react.round") == round_num:
+            return s
+    raise AssertionError(f"no STEP span found for round={round_num}")
+
+
+# ============================================================================
+# H1: TOOL span parent_span_id == STEP span_id (was CHAIN before fix)
+# ============================================================================
+
+
+class TestToolParentIsStep:
+    def test_single_tool_parent_is_step_round_one(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """The single TOOL span in simple_test_entry should be a child of the
+        first STEP span (round=1), not the CHAIN span."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = _spans_by_kind(spans, "TOOL")
+        assert len(tool_spans) == 1, [s.name for s in spans]
+
+        tool = tool_spans[0]
+        step_round1 = _step_for_round(spans, 1)
+        chain = _spans_by_kind(spans, "CHAIN")[0]
+
+        # H1 core assertion: parent is STEP, not CHAIN.
+        assert tool.parent is not None
+        assert tool.parent.span_id == step_round1.context.span_id, (
+            "TOOL parent should be STEP round=1, got "
+            f"{tool.parent.span_id} (STEP={step_round1.context.span_id}, "
+            f"CHAIN={chain.context.span_id})"
+        )
+        assert tool.parent.span_id != chain.context.span_id
+
+        # The trace_id of course remains consistent.
+        assert tool.context.trace_id == step_round1.context.trace_id
+
+    def test_multi_step_each_tool_parented_to_correct_step(
+        self, span_exporter, instrument,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Multi-step scenario: 2 successful tool steps + 1 prepare_to_answer.
+
+        Each TOOL span must be parented to the STEP span of its own round,
+        not to the CHAIN or to a different round's STEP.
+        """
+        handler = _StubHandler()
+        # Test entry with 2 tool steps (search, lookup) then prepare_to_answer.
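+        # Expected outcome: two TOOL spans (search, lookup); the final
+        # prepare_to_answer step returns plain text, so it yields no TOOL span.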
+        test_entry = {
+            "id": "wild_tool_bench_multi_001",
+            "english_env_info": "2025-01-01",
+            "english_tools": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "search",
+                        "description": "Search items",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"q": {"type": "string"}},
+                            "required": ["q"],
+                        },
+                    },
+                },
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "lookup",
+                        "description": "Look up details",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"id": {"type": "string"}},
+                            "required": ["id"],
+                        },
+                    },
+                },
+            ],
+            "english_tasks": ["Find and summarize item X"],
+            "english_answer_list": [
+                [
+                    {
+                        "action": {"name": "search", "arguments": {"q": "X"}},
+                        "observation": "found:item_42",
+                        "dependency_list": [],
+                    },
+                    {
+                        "action": {"name": "lookup", "arguments": {"id": "item_42"}},
+                        "observation": "details:hello",
+                        "dependency_list": [0],
+                    },
+                    {
+                        "action": {"name": "prepare_to_answer", "arguments": {}},
+                        "observation": "Item X is hello.",
+                        "dependency_list": [1],
+                    },
+                ]
+            ],
+        }
+
+        resp_step1 = tool_call_response_factory(
+            "search", {"q": "X"}, "call_search_1"
+        )
+        resp_step2 = tool_call_response_factory(
+            "lookup", {"id": "item_42"}, "call_lookup_1"
+        )
+        resp_step3 = text_response_factory("Item X is hello.")
+        handler._step_responses = [resp_step1, resp_step2, resp_step3]
+
+        handler.inference_multi_turn(test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = sorted(
+            _spans_by_kind(spans, "TOOL"),
+            key=lambda s: (s.attributes or {}).get("gen_ai.tool.name") or "",
+        )
+        assert len(tool_spans) == 2, [s.name for s in spans]
+
+        step_round1 = _step_for_round(spans, 1)
+        step_round2 = _step_for_round(spans, 2)
+        chain = _spans_by_kind(spans, "CHAIN")[0]
+
+        lookup_tool = next(
+            t for t in tool_spans
+            if (t.attributes or {}).get("gen_ai.tool.name") == "lookup"
+        )
+        search_tool = next(
+            t for t in tool_spans
+            if (t.attributes or {}).get("gen_ai.tool.name") == "search"
+        )
+
+        # search → STEP round=1, lookup → STEP round=2
+        assert search_tool.parent.span_id == step_round1.context.span_id
+        assert lookup_tool.parent.span_id == step_round2.context.span_id
+        # Neither parented on CHAIN (the regression we are fixing)
+        for t in tool_spans:
+            assert t.parent.span_id != chain.context.span_id
+            assert t.context.trace_id == chain.context.trace_id
+
+
+# ============================================================================
+# M1: CHAIN span carries input.value and output.value
+# ============================================================================
+
+
+class TestChainInputOutputValue:
+    def test_chain_input_value_and_output_value(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = _spans_by_kind(spans, "CHAIN")
+        assert len(chain_spans) == 1
+        attrs = dict(chain_spans[0].attributes or {})
+
+        # input.value: last user message of the chain (prepared by wtb's
+        # _pre_messages_processing, which appends the current task as user).
+        assert "input.value" in attrs, attrs
+        assert attrs["input.value"] == "What is the weather in Beijing?"
+
+        # output.value: JSON containing action_name_label, task_idx, is_optimal.
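+        # e.g. {"action_name_label": "correct", "task_idx": 0, "is_optimal": true}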
+ assert "output.value" in attrs, attrs + out = json.loads(attrs["output.value"]) + assert out["action_name_label"] == "correct" + assert out["task_idx"] == 0 + assert out["is_optimal"] is True + + def test_chain_input_value_truncated_when_long( + self, span_exporter, instrument, + tool_call_response_factory, text_response_factory, + ): + """Very long user content should be truncated to keep span attribute small.""" + handler = _StubHandler() + long_text = "x" * 5000 + test_entry = { + "id": "wild_tool_bench_long_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "noop", + "description": "noop", + "parameters": {"type": "object", "properties": {}}, + }, + } + ], + "english_tasks": [long_text], + "english_answer_list": [ + [ + { + "action": {"name": "prepare_to_answer", "arguments": {}}, + "observation": "ok", + "dependency_list": [], + } + ] + ], + } + handler._step_responses = [text_response_factory("ok")] + + handler.inference_multi_turn(test_entry) + + spans = span_exporter.get_finished_spans() + chain = _spans_by_kind(spans, "CHAIN")[0] + attrs = dict(chain.attributes or {}) + assert "input.value" in attrs + # Default cap is 4096; truncated form must be <= cap + suffix length. + assert len(attrs["input.value"]) <= 4096 + len("...(truncated)") + assert attrs["input.value"].startswith("xxx") + + +# ============================================================================ +# M2: STEP span carries gen_ai.react.finish_reason on error paths +# ============================================================================ + + +class TestStepFinishReason: + def test_finish_reason_action_name_mismatch( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, + ): + handler = _StubHandler() + # wrong tool name → wtb's "action name not in candidate" branch + handler._step_responses = [ + tool_call_response_factory("wrong_tool", {"x": 1}, "call_bad") + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "action_name_mismatch" + + def test_finish_reason_empty_response( + self, span_exporter, instrument, simple_test_entry, make_completion, + ): + """Empty content + no tool_calls → STEP gets finish_reason=empty_response.""" + from tests.conftest import ( + FakeChatCompletion, + _make_chat_completion_response, + ) + + handler = _StubHandler() + handler._step_responses = [ + FakeChatCompletion( + _make_chat_completion_response(content="", tool_calls=None) + ) + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "empty_response" + + def test_finish_reason_request_exception( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call → STEP ERROR + finish_reason=error.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Boom")] + + with pytest.raises(RuntimeError): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert steps[0].status.status_code == StatusCode.ERROR + assert 
attrs.get("gen_ai.react.finish_reason") == "error" + + def test_finish_reason_omitted_on_success( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Successful steps should NOT have a finish_reason (per execute.md).""" + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + for s in _spans_named(spans, "react step"): + attrs = dict(s.attributes or {}) + assert "gen_ai.react.finish_reason" not in attrs, ( + f"unexpected finish_reason on success step round=" + f"{attrs.get('gen_ai.react.round')}: {attrs.get('gen_ai.react.finish_reason')}" + ) + + +# ============================================================================ +# M3: TOOL span carries gen_ai.tool.call.arguments / result / description +# (and keeps wildtool.tool.execution_mode) +# ============================================================================ + + +class TestToolSensitiveAttributes: + def test_tool_args_result_description_and_execution_mode( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("Sunny day") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = _spans_by_kind(spans, "TOOL") + assert len(tool_spans) == 1 + attrs = dict(tool_spans[0].attributes or {}) + + # M3 explicit attrs. + args_attr = attrs.get("gen_ai.tool.call.arguments") + assert args_attr is not None + assert json.loads(args_attr) == {"city": "Beijing"} + + # observation comes from the appended {"role": "tool", ...} message + # written by wtb after the call matches the answer; it's a string. + result_attr = attrs.get("gen_ai.tool.call.result") + assert result_attr == "Sunny, 25°C", attrs + + # description sourced from inference_data["tools"][i].function.description + assert attrs.get("gen_ai.tool.description") == "Get weather for a city" + + # Existing custom attribute must still be present. + assert ( + attrs.get("wildtool.tool.execution_mode") + == "ground_truth_replay" + ) + + +# ============================================================================ +# H2: STEP span carries gen_ai.system / gen_ai.provider.name fallback +# ============================================================================ + + +class TestStepProviderFallback: + def test_step_has_provider_name_fallback( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 2 + for s in steps: + attrs = dict(s.attributes or {}) + assert attrs.get("gen_ai.system") == "openai", attrs + assert attrs.get("gen_ai.provider.name") == "openai", attrs