From b7e7a4b0bcfc507c94858c4f17fc717eee6437ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=89=A7=E6=80=9D?= Date: Thu, 7 May 2026 09:32:45 +0800 Subject: [PATCH] feat: support wild-tool Change-Id: I0da98161cbdbe6a51b963bcc19f45a3d2d977968 --- .../README.md | 55 ++ .../pyproject.toml | 66 ++ .../instrumentation/wildtool/__init__.py | 161 +++++ .../instrumentation/wildtool/_wrappers.py | 644 ++++++++++++++++++ .../instrumentation/wildtool/package.py | 2 + .../instrumentation/wildtool/utils.py | 17 + .../instrumentation/wildtool/version.py | 1 + .../tests/__init__.py | 0 .../tests/conftest.py | 182 +++++ .../tests/test_agent_span.py | 108 +++ .../tests/test_chain_step_tool_spans.py | 283 ++++++++ .../tests/test_entry_span.py | 115 ++++ .../tests/test_error_scenarios.py | 135 ++++ .../tests/test_instrumentor.py | 20 + .../tests/test_round2_fixes.py | 441 ++++++++++++ 15 files changed, 2230 insertions(+) create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py create mode 100644 instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md new file mode 100644 index 000000000..1b0499fa4 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md @@ -0,0 +1,55 @@ +# LoongSuite WildToolBench Instrumentation + +OpenTelemetry instrumentation for the [WildToolBench](https://github.com/yupeijei1997/WildToolBench) benchmark framework. + +## Installation + +WildToolBench is not available on PyPI. Install it from source: + +```bash +pip install -e /path/to/WildToolBench/wild-tool-bench +pip install loongsuite-instrumentation-wildtool +``` + +## Requirements + +- **OpenAI provider instrumentation**: To produce LLM spans, you must also enable an OpenAI provider instrumentation (e.g., `opentelemetry-instrumentation-openai` or LoongSuite's equivalent). 
This plugin creates ENTRY/AGENT/CHAIN/STEP/TOOL spans but does **not** create LLM spans itself.
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+
+WildToolInstrumentor().instrument()
+
+# Run WildToolBench as usual; spans are generated automatically.
+```
+
+## Span Topology
+
+```
+ENTRY (enter_ai_application_system)
+└── AGENT (invoke_agent wildtool)
+    └── CHAIN (workflow task_{idx})
+        └── STEP (react step)
+            ├── [LLM span, from provider instrumentation]
+            └── TOOL (execute_tool {tool_name})
+```
+
+## Patch Points
+
+| # | Target | Span Type |
+|---|--------|-----------|
+| P1 | `multi_threaded_inference` | ENTRY |
+| P2 | `BaseHandler.inference_multi_turn` | AGENT |
+| P3 | `BaseHandler.inference_and_eval_multi_step` | CHAIN + TOOL |
+| P4 | `BaseHandler._request_tool_call` | STEP |
+| P5 | `BaseHandler._parse_api_response` | (token extraction) |
+
+## Round 2 fixes (see `llm-dev/execute.md` § "修订记录 (Round 2 fix)", i.e. the revision log)
+
+- **H1**: TOOL spans are now parented on STEP, not CHAIN. Strategy A was enhanced: the chain wrapper holds a `round → STEP span` map and uses `trace.set_span_in_context(step_span)` to anchor each post-hoc TOOL span on the matching STEP. STEP `SpanContext`s remain valid parents even after `end()`.
+- **H2 (provider-name fallback)**: `opentelemetry-instrumentation-openai-v2 == 0.62b1` only emits the legacy `gen_ai.system` attribute on its LLM span; the new `gen_ai.provider.name` attribute is missing. As a *pure fallback*, the wildtool plugin writes both `gen_ai.system="openai"` and `gen_ai.provider.name="openai"` on the **STEP** span (not on the LLM span, which is owned by the OpenAI v2 instrumentation and which we do **not** patch). Once the upstream OpenAI v2 instrumentation emits `gen_ai.provider.name` natively, this fallback can be removed.
+- **M1**: The CHAIN span now carries `input.value` (the last user message in `inference_data["messages"]`, truncated to 4096 chars) and `output.value` (a JSON summary of `action_name_label`/`task_idx`/`is_optimal`).
+- **M2**: The STEP span now carries `gen_ai.react.finish_reason` on error paths. The mapping table is in `execute.md` § "M2: gen_ai.react.finish_reason 取值映射" (value mapping).
+- **M3**: TOOL spans explicitly write `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` / `gen_ai.tool.description`, bypassing the `OTEL_INSTRUMENTATION_GENAI_CAPTURE_*` gating in `opentelemetry-util-genai`. The custom `wildtool.tool.execution_mode = "ground_truth_replay"` attribute is preserved.
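+
+## Example: combined setup
+
+A minimal setup sketch, not part of the benchmark itself. It assumes the `opentelemetry-instrumentation-openai-v2` package mentioned above for the LLM spans and uses a console exporter for local inspection; swap in your own provider instrumentation and exporter as needed.
+
+```python
+from opentelemetry import trace
+from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (
+    BatchSpanProcessor,
+    ConsoleSpanExporter,
+)
+
+provider = TracerProvider()
+provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+trace.set_tracer_provider(provider)
+
+# The provider instrumentation emits the LLM spans; wildtool emits the
+# ENTRY/AGENT/CHAIN/STEP/TOOL spans around them.
+OpenAIInstrumentor().instrument(tracer_provider=provider)
+WildToolInstrumentor().instrument(tracer_provider=provider)
+```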
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml new file mode 100644 index 000000000..b8f9f44d0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml @@ -0,0 +1,66 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-wildtool" +dynamic = ["version"] +description = "LoongSuite WildToolBench Instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9" +authors = [ + { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api ~= 1.37", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "opentelemetry-util-genai", + "wrapt >= 1.17.3, < 3.0.0", +] + +[project.optional-dependencies] +instruments = [ + "openai >= 1.0.0", +] + +test = [ + "pytest ~= 8.0", + "pytest-cov ~= 4.1.0", + "pytest-forked >= 1.6.0", + "opentelemetry-sdk >= 1.37", + "openai >= 1.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +wildtool = "opentelemetry.instrumentation.wildtool:WildToolInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-wildtool" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/wildtool/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py new file mode 100644 index 000000000..dad772500 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py @@ -0,0 +1,161 @@ +"""OpenTelemetry WildToolBench Instrumentation""" + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.instrumentation.wildtool.package import _instruments +from opentelemetry.instrumentation.wildtool.version import __version__ +from opentelemetry.instrumentation.wildtool._wrappers import ( + WildToolAgentWrapper, + WildToolChainWrapper, + WildToolEntryWrapper, + WildToolParseWrapper, + WildToolRequestWrapper, +) +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler + +logger = logging.getLogger(__name__) + +_LLM_RESPONSE_GEN_MODULE = "wtb._llm_response_generation" +_BASE_HANDLER_MODULE = 
"wtb.model_handler.base_handler" + +__all__ = ["WildToolInstrumentor", "__version__"] + + +class WildToolInstrumentor(BaseInstrumentor): + """OpenTelemetry instrumentor for WildToolBench framework.""" + + def __init__(self): + super().__init__() + self._handler = None + # Track concrete handler subclasses whose abstract _request_tool_call / + # _parse_api_response we have already wrapped, so we can unwrap on + # uninstrument and avoid double-wrapping. + self._patched_handler_classes: set = set() + self._request_wrapper = None + self._parse_wrapper = None + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + meter_provider = kwargs.get("meter_provider") + logger_provider = kwargs.get("logger_provider") + + self._handler = ExtendedTelemetryHandler( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + logger_provider=logger_provider, + ) + self._request_wrapper = WildToolRequestWrapper(self._handler) + self._parse_wrapper = WildToolParseWrapper(self._handler) + + # P1: ENTRY span + try: + wrap_function_wrapper( + _LLM_RESPONSE_GEN_MODULE, + "multi_threaded_inference", + WildToolEntryWrapper(self._handler), + ) + except Exception as e: + logger.warning("Failed to instrument multi_threaded_inference: %s", e) + + # P2: AGENT span + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + "BaseHandler.inference_multi_turn", + WildToolAgentWrapper(self._handler), + ) + except Exception as e: + logger.warning("Failed to instrument inference_multi_turn: %s", e) + + # P3: CHAIN span (+ STEP + TOOL management). + # The chain wrapper also lazily patches the concrete subclass' + # `_request_tool_call` / `_parse_api_response` on first use, so that + # subclasses overriding the abstract base methods are still + # intercepted (P4 / P5). + try: + wrap_function_wrapper( + _BASE_HANDLER_MODULE, + "BaseHandler.inference_and_eval_multi_step", + WildToolChainWrapper(self._handler, self), + ) + except Exception as e: + logger.warning( + "Failed to instrument inference_and_eval_multi_step: %s", e + ) + + def ensure_handler_class_patched(self, handler_cls) -> None: + """Lazily wrap the concrete handler subclass' P4/P5 methods. + + WildToolBench declares ``_request_tool_call`` and ``_parse_api_response`` + as abstract on ``BaseHandler``, but real handlers (and tests) override + them. Python method resolution dispatches directly to the override and + therefore never reaches a wrapper installed on the base class. We + instead wrap the override on first invocation per subclass. 
+ """ + if handler_cls in self._patched_handler_classes: + return + self._patched_handler_classes.add(handler_cls) + + module_name = handler_cls.__module__ + cls_name = handler_cls.__name__ + for method, wrapper in ( + ("_request_tool_call", self._request_wrapper), + ("_parse_api_response", self._parse_wrapper), + ): + if method not in handler_cls.__dict__: + continue + try: + wrap_function_wrapper( + module_name, + f"{cls_name}.{method}", + wrapper, + ) + except Exception as e: + logger.debug( + "Failed to wrap %s.%s.%s: %s", + module_name, + cls_name, + method, + e, + ) + + def _uninstrument(self, **kwargs: Any) -> None: + try: + import wtb._llm_response_generation as llm_gen + + unwrap(llm_gen, "multi_threaded_inference") + except Exception as e: + logger.debug("Failed to uninstrument multi_threaded_inference: %s", e) + + try: + import wtb.model_handler.base_handler as bh + + unwrap(bh.BaseHandler, "inference_multi_turn") + unwrap(bh.BaseHandler, "inference_and_eval_multi_step") + except Exception as e: + logger.debug("Failed to uninstrument BaseHandler methods: %s", e) + + for cls in list(self._patched_handler_classes): + for method in ("_request_tool_call", "_parse_api_response"): + if method in cls.__dict__: + try: + unwrap(cls, method) + except Exception as e: + logger.debug( + "Failed to unwrap %s.%s: %s", + cls.__name__, + method, + e, + ) + self._patched_handler_classes.clear() + self._request_wrapper = None + self._parse_wrapper = None + self._handler = None diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py new file mode 100644 index 000000000..612a332ab --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py @@ -0,0 +1,644 @@ +"""Wrapper classes for WildToolBench instrumentation. + +Each wrapper corresponds to one patch point and manages the lifecycle +of one or more span types. + +Round 2 fix highlights (see ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)"): + +H1 + TOOL span parent is now STEP rather than CHAIN. Each STEP invocation is + appended to a per-chain list in :data:`_chain_step_invocations`; when the + chain wrapper post-processes ``inference_log`` it looks up the matching + STEP span by ``round`` and uses + :func:`opentelemetry.trace.set_span_in_context` so ``start_execute_tool`` + parents the TOOL span on the STEP context (even if STEP is already + closed — its :class:`SpanContext` remains a valid parent reference). + +H2 + The OpenAI v2 provider instrumentation (0.62b1) writes only the legacy + ``gen_ai.system`` attribute to its LLM span. The wildtool plugin now + writes both ``gen_ai.system`` and ``gen_ai.provider.name`` on the STEP + span as a fallback so the new semantic-conventions attribute is present + in the trace tree even before the upstream OpenAI v2 instrumentation + catches up. We do **not** patch the OpenAI v2 instrumentation itself. + +M1 + ``input.value`` (last user message in the chain's ``messages``, truncated + to 4096 chars) and ``output.value`` (a JSON of action label, task index + and is_optimal) are written on the CHAIN span. + +M2 + ``gen_ai.react.finish_reason`` is derived from ``inference_log`` on the + *last* (currently active) STEP. Mappings: + + ``"parse_tool_calls_failed"`` + ``error_reason`` contains "parse tool_calls failed". 
+ ``"action_name_mismatch"`` + ``error_reason`` contains "action name not in candidate". + ``"empty_response"`` + ``error_reason`` contains "tool_calls and content are None". + ``"error"`` + request raised an exception (handled in + :class:`WildToolRequestWrapper`). + +M3 + ``gen_ai.tool.call.arguments``, ``gen_ai.tool.call.result`` and + ``gen_ai.tool.description`` are written explicitly on TOOL spans + *before* close as a fallback. ``opentelemetry-util-genai`` gates these + sensitive attributes behind ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` env + vars; the wildtool plugin always writes them since wtb data is + benchmark-synthetic and never PII. +""" + +import json +import logging +from contextvars import ContextVar +from typing import List, Optional + +from opentelemetry.trace import StatusCode, set_span_in_context +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) +from opentelemetry.util.genai.types import Error + +logger = logging.getLogger(__name__) + +# ─────────────────────────── ContextVars ─────────────────────────────── +# The CHAIN wrapper opens a new logical "chain" by flipping ``_in_chain`` +# and resetting the counter. The REQUEST wrapper reads these to decide +# whether to create a STEP span and what round number to assign. +_in_chain: ContextVar[bool] = ContextVar("_wt_in_chain", default=False) + +# Currently open STEP invocation. Used by the parse wrapper to attach +# token attributes to the right span. +_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar( + "_wt_step_inv", default=None +) +_step_counter: ContextVar[int] = ContextVar("_wt_step_ctr", default=0) + +# Per-chain list of every STEP invocation created in the current chain +# (in `round` order). The chain wrapper allocates this list on entry and +# uses it after ``wrapped`` returns to re-parent TOOL spans onto the +# matching STEP. Even if a STEP span is already ``end()``-ed, its +# :class:`SpanContext` stays valid as a parent reference for new spans. +_chain_step_invocations: ContextVar[Optional[List[ReactStepInvocation]]] = ( + ContextVar("_wt_chain_step_invs", default=None) +) + +_PROVIDER_FALLBACK_NAME = "openai" +_INPUT_VALUE_MAX_CHARS = 4096 + + +def _close_active_step(handler: ExtendedTelemetryHandler) -> None: + """Close the currently active STEP span, if any.""" + prev = _step_invocation.get() + if prev is not None: + try: + handler.stop_react_step(prev) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to close step: %s", e) + _step_invocation.set(None) + + +def _truncate(text: str, max_chars: int) -> str: + if len(text) <= max_chars: + return text + return text[:max_chars] + "...(truncated)" + + +def _stringify(value) -> str: + if isinstance(value, str): + return value + try: + return json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return str(value) + + +class WildToolEntryWrapper: + """P1: Wraps multi_threaded_inference → ENTRY span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + # Signature: multi_threaded_inference(handler, model_name, test_case). + # We only need model_name and test_case for ENTRY attributes; the + # handler instance flows through as args[0] untouched. 
+ model_name = args[1] if len(args) > 1 else kwargs.get("model_name", "") + test_case = args[2] if len(args) > 2 else kwargs.get("test_case", {}) + + invocation = EntryInvocation( + session_id=test_case.get("id"), + attributes={ + "gen_ai.framework": "wildtool", + "gen_ai.request.model": model_name, + "wildtool.turn_count": len(test_case.get("english_tasks", [])), + }, + ) + self._handler.start_entry(invocation) + try: + result = wrapped(*args, **kwargs) + self._handler.stop_entry(invocation) + return result + except Exception as e: + self._handler.fail_entry( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +class WildToolAgentWrapper: + """P2: Wraps BaseHandler.inference_multi_turn → AGENT span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + test_entry = args[0] if args else kwargs.get("test_entry", {}) + + invocation = InvokeAgentInvocation( + provider=None, + agent_name=type(instance).__name__, + conversation_id=test_entry.get("id"), + request_model=getattr(instance, "model_name", None), + attributes={ + "gen_ai.framework": "wildtool", + "wildtool.turn_count": len( + test_entry.get("english_answer_list", []) + ), + }, + ) + self._handler.start_invoke_agent(invocation) + try: + result = wrapped(*args, **kwargs) + total_input = 0 + total_output = 0 + for task_result in (result or []): + if isinstance(task_result, dict): + total_input += sum( + task_result.get("input_token_count", []) + ) + total_output += sum( + task_result.get("output_token_count", []) + ) + if total_input: + invocation.input_tokens = total_input + if total_output: + invocation.output_tokens = total_output + self._handler.stop_invoke_agent(invocation) + return result + except Exception as e: + self._handler.fail_invoke_agent( + invocation, Error(message=str(e), type=type(e)) + ) + raise + + +class WildToolChainWrapper: + """P3: Wraps BaseHandler.inference_and_eval_multi_step → CHAIN span. + + Also manages the lifecycle of the final STEP span and creates TOOL spans + from the returned ``inference_log`` after the original function completes. + Round 2 fixes (H1/M1/M2/M3) are implemented here. + """ + + def __init__(self, handler: ExtendedTelemetryHandler, instrumentor=None): + self._handler = handler + self._instrumentor = instrumentor + + def __call__(self, wrapped, instance, args, kwargs): + if self._instrumentor is not None and instance is not None: + try: + self._instrumentor.ensure_handler_class_patched(type(instance)) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to ensure subclass patched: %s", e) + + inference_data = args[0] if args else kwargs.get("inference_data", {}) + if not isinstance(inference_data, dict): + inference_data = {} + task_idx = inference_data.get("task_idx", 0) + test_entry_id = inference_data.get("test_entry_id", "") + + span_name = f"workflow task_{task_idx}" + tracer = self._handler._tracer + + chain_token = _in_chain.set(True) + counter_token = _step_counter.set(0) + step_token = _step_invocation.set(None) + chain_steps: List[ReactStepInvocation] = [] + chain_steps_token = _chain_step_invocations.set(chain_steps) + + chain_attributes = { + "gen_ai.span.kind": "CHAIN", + "gen_ai.operation.name": "workflow", + "gen_ai.framework": "wildtool", + "wildtool.task_idx": task_idx, + "wildtool.test_entry_id": test_entry_id, + } + + # M1: Capture last user message as ``input.value`` BEFORE running the + # wrapped function (the wtb function mutates ``messages`` in place). 
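+        # Illustrative shape of the list _extract_input_value scans (wtb
+        # chat format; only the last "user" entry is captured):
+        #   [{"role": "user", "content": "What is the weather in Beijing?"},
+        #    {"role": "assistant", "tool_calls": [...]},
+        #    {"role": "tool", "tool_call_id": "call_001", "content": "..."}]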
+ input_value = self._extract_input_value(inference_data) + if input_value is not None: + chain_attributes["input.value"] = input_value + + with tracer.start_as_current_span( + name=span_name, attributes=chain_attributes + ) as span: + try: + result = wrapped(*args, **kwargs) + + # M2: Set finish_reason on the currently active (last) STEP + # BEFORE we close it. Only the terminal step ever carries an + # error finish_reason (every wtb error path triggers `break`). + if isinstance(result, dict): + self._apply_last_step_finish_reason( + result.get("inference_log", {}) + ) + + _close_active_step(self._handler) + + if isinstance(result, dict): + label = result.get("action_name_label", "") + is_optimal = bool(result.get("is_optimal", False)) + span.set_attribute("wildtool.action_name_label", label) + span.set_attribute("wildtool.is_optimal", is_optimal) + + # M1: ``output.value`` summarising chain outcome. + try: + span.set_attribute( + "output.value", + json.dumps( + { + "action_name_label": label, + "task_idx": task_idx, + "is_optimal": is_optimal, + }, + ensure_ascii=False, + ), + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set output.value: %s", e) + + # H1 + M3: re-parent TOOL spans on STEP and force-write + # tool call sensitive attributes. + self._create_tool_spans_from_log( + result.get("inference_log", {}), + inference_data, + chain_steps, + ) + + span.set_status(StatusCode.OK) + return result + except Exception as e: + _close_active_step(self._handler) + span.record_exception(e) + span.set_status(StatusCode.ERROR) + raise + finally: + _chain_step_invocations.reset(chain_steps_token) + _step_counter.reset(counter_token) + _step_invocation.reset(step_token) + _in_chain.reset(chain_token) + + # -- M1 --------------------------------------------------------------- + + @staticmethod + def _extract_input_value(inference_data) -> Optional[str]: + msgs = inference_data.get("messages") if isinstance( + inference_data, dict + ) else None + if not isinstance(msgs, list): + return None + for m in reversed(msgs): + if not isinstance(m, dict) or m.get("role") != "user": + continue + content = m.get("content") + if content is None: + continue + text = _stringify(content) + return _truncate(text, _INPUT_VALUE_MAX_CHARS) + return None + + # -- M2 --------------------------------------------------------------- + + def _apply_last_step_finish_reason(self, inference_log) -> None: + if not isinstance(inference_log, dict): + return + current_step = _step_invocation.get() + if current_step is None or current_step.round is None: + return + step_key = f"step_{current_step.round - 1}" + step_data = inference_log.get(step_key) + if not isinstance(step_data, dict): + return + output = step_data.get("inference_output") or {} + if not isinstance(output, dict): + return + label = output.get("current_action_name_label") + error_reason = output.get("error_reason") or "" + reason = self._derive_step_finish_reason(label, error_reason) + if reason is None: + return + # Setting `invocation.finish_reason` is enough — the util-genai + # `_apply_react_step_finish_attributes` writes + # ``gen_ai.react.finish_reason`` from this field on stop. 
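+        # e.g. label == "error" with error_reason containing
+        # "parse tool_calls failed" yields "parse_tool_calls_failed"
+        # (full mapping in the module docstring under M2).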
+ current_step.finish_reason = reason + + @staticmethod + def _derive_step_finish_reason( + label, error_reason: str + ) -> Optional[str]: + """Map wtb inference_log error_reason → gen_ai.react.finish_reason.""" + if label != "error": + return None + if "parse tool_calls failed" in error_reason: + return "parse_tool_calls_failed" + if "action name not in candidate" in error_reason: + return "action_name_mismatch" + if "tool_calls and content are None" in error_reason: + return "empty_response" + return "error" + + # -- H1 + M3 ---------------------------------------------------------- + + def _create_tool_spans_from_log( + self, + inference_log, + inference_data, + chain_steps: List[ReactStepInvocation], + ) -> None: + """Post-hoc TOOL span creation from inference_log. + + Uses the per-chain STEP invocation list to parent each TOOL span on + the matching STEP span (H1). Sensitive tool-call attributes are + written explicitly on the span (M3) so they appear regardless of + ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` settings. + """ + if not isinstance(inference_log, dict): + return + + # round → SpanContext-bearing OTel context for parenting + step_ctx_by_round = {} + for step_inv in chain_steps: + if step_inv.round is None or step_inv.span is None: + continue + try: + step_ctx_by_round[step_inv.round] = set_span_in_context( + step_inv.span + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to compute step parent context: %s", e) + + # tool name → description (for gen_ai.tool.description) + tool_desc_map = {} + tools = inference_data.get("tools") if isinstance( + inference_data, dict + ) else None + if isinstance(tools, list): + for tool in tools: + if not isinstance(tool, dict): + continue + func = tool.get("function") or tool + if not isinstance(func, dict): + continue + name = func.get("name") + desc = func.get("description") + if name: + tool_desc_map[name] = desc + + # Extract tool observations from final messages keyed by tool_call_id; + # wtb only embeds them in messages (not in inference_answer) for the + # tool_call branch. 
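+        # Illustrative message this indexes (shape mirrored by the test
+        # fixtures):
+        #   {"role": "tool", "tool_call_id": "call_001",
+        #    "content": "Sunny, 25°C"}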
+        observation_by_call_id = {}
+        messages = inference_data.get("messages") if isinstance(
+            inference_data, dict
+        ) else None
+        if isinstance(messages, list):
+            for msg in messages:
+                if not isinstance(msg, dict) or msg.get("role") != "tool":
+                    continue
+                tid = msg.get("tool_call_id")
+                if tid is None:
+                    continue
+                content = msg.get("content")
+                if content is None:
+                    continue
+                observation_by_call_id[tid] = (
+                    content if isinstance(content, str) else _stringify(content)
+                )
+
+        # Iterate steps in numeric order; a plain lexicographic sort would
+        # put "step_10" before "step_2".
+        step_keys = []
+        for key in inference_log:
+            if not key.startswith("step_"):
+                continue
+            try:
+                step_keys.append((int(key[len("step_"):]), key))
+            except ValueError:
+                continue
+
+        for step_idx, key in sorted(step_keys):
+            round_num = step_idx + 1
+
+            step_data = inference_log[key]
+            if not isinstance(step_data, dict):
+                continue
+            output = step_data.get("inference_output") or {}
+            if not isinstance(output, dict):
+                continue
+            tool_calls = output.get("tool_calls")
+            label = output.get("current_action_name_label")
+            if not tool_calls or label != "correct":
+                continue
+
+            answer_data = step_data.get("inference_answer") or {}
+            candidate = (
+                answer_data.get("candidate_0_answer_function_list")
+                if isinstance(answer_data, dict)
+                else None
+            ) or {}
+            candidate_observation = (
+                candidate.get("observation")
+                if isinstance(candidate, dict)
+                else None
+            )
+
+            parent_ctx = step_ctx_by_round.get(round_num)
+
+            for tc in tool_calls:
+                if not isinstance(tc, dict):
+                    continue
+                func = tc.get("function") or {}
+                if not isinstance(func, dict):
+                    func = {}
+                tool_name = func.get("name", "unknown")
+                tool_id = tc.get("id")
+                tool_args_raw = func.get("arguments", "")
+                tool_args_str = (
+                    tool_args_raw
+                    if isinstance(tool_args_raw, str)
+                    else _stringify(tool_args_raw)
+                )
+
+                observation_str: Optional[str] = None
+                if tool_id is not None and tool_id in observation_by_call_id:
+                    observation_str = observation_by_call_id[tool_id]
+                elif candidate_observation is not None:
+                    observation_str = (
+                        candidate_observation
+                        if isinstance(candidate_observation, str)
+                        else _stringify(candidate_observation)
+                    )
+
+                description = tool_desc_map.get(tool_name)
+
+                invocation = ExecuteToolInvocation(
+                    tool_name=tool_name,
+                    tool_call_id=tool_id,
+                    tool_call_arguments=tool_args_str,
+                    tool_call_result=observation_str,
+                    tool_type="function",
+                    tool_description=description,
+                    attributes={
+                        "wildtool.tool.execution_mode": "ground_truth_replay",
+                    },
+                )
+
+                try:
+                    self._handler.start_execute_tool(
+                        invocation, context=parent_ctx
+                    )
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to start_execute_tool: %s", e)
+                    continue
+
+                # M3: explicitly write tool_call sensitive attrs. The
+                # util-genai `_get_tool_call_data_attributes` helper guards
+                # these behind experimental-mode + content-capture-mode env
+                # vars which are not always set in real deployments.
+                tool_span = invocation.span
+                if tool_span is not None and tool_span.is_recording():
+                    try:
+                        tool_span.set_attribute(
+                            "gen_ai.tool.call.arguments", tool_args_str
+                        )
+                        if observation_str is not None:
+                            tool_span.set_attribute(
+                                "gen_ai.tool.call.result", observation_str
+                            )
+                        if description:
+                            tool_span.set_attribute(
+                                "gen_ai.tool.description", description
+                            )
+                    except Exception as e:  # noqa: BLE001
+                        logger.debug("Failed to set tool span attrs: %s", e)
+
+                try:
+                    self._handler.stop_execute_tool(invocation)
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to stop_execute_tool: %s", e)
+
+
+class WildToolRequestWrapper:
+    """P4: Wraps BaseHandler._request_tool_call.
+ + Creates STEP span (ReactStepInvocation) before each LLM call. + Extracts latency from return value. Also writes the H2 provider-name + fallback attributes (``gen_ai.system`` + ``gen_ai.provider.name``) on + the STEP span so the new semconv attribute is present in the trace + even when the upstream OpenAI v2 instrumentation only emits the legacy + ``gen_ai.system``. + """ + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + if not _in_chain.get(): + return wrapped(*args, **kwargs) + + # Close the previous step (the natural end-of-step is when the next + # request fires). The STEP span's SpanContext stays valid as a + # parent for TOOL spans created later. + _close_active_step(self._handler) + + step_num = _step_counter.get() + 1 + _step_counter.set(step_num) + + step_inv = ReactStepInvocation(round=step_num) + try: + self._handler.start_react_step(step_inv) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to start react step: %s", e) + return wrapped(*args, **kwargs) + + # H2: provider-name fallback attributes. Written on the STEP, not + # on the LLM span, because the LLM span is owned by the OpenAI v2 + # provider instrumentation and is created lazily inside the wtb + # request implementation. + if step_inv.span is not None and step_inv.span.is_recording(): + try: + step_inv.span.set_attribute( + "gen_ai.system", _PROVIDER_FALLBACK_NAME + ) + step_inv.span.set_attribute( + "gen_ai.provider.name", _PROVIDER_FALLBACK_NAME + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set provider fallback attrs: %s", e) + + # Track this step for H1 TOOL re-parenting. + chain_steps = _chain_step_invocations.get() + if chain_steps is not None: + chain_steps.append(step_inv) + _step_invocation.set(step_inv) + + try: + result = wrapped(*args, **kwargs) + if isinstance(result, tuple) and len(result) == 2: + _, latency = result + if step_inv.span and step_inv.span.is_recording(): + try: + step_inv.span.set_attribute( + "wildtool.latency", float(latency) + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set wildtool.latency: %s", e) + return result + except Exception as e: + step_inv.finish_reason = "error" + self._handler.fail_react_step( + step_inv, Error(message=str(e), type=type(e)) + ) + _step_invocation.set(None) + raise + + +class WildToolParseWrapper: + """P5: Wraps BaseHandler._parse_api_response. + + Extracts token counts from parsed response and sets them on the + current STEP span as attributes. 
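+
+    The parsed dict is expected to expose ``input_token`` /
+    ``output_token`` keys, as the handler implementations in the tests do.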
+ """ + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + result = wrapped(*args, **kwargs) + + step_inv = _step_invocation.get() + if step_inv and step_inv.span and step_inv.span.is_recording(): + if isinstance(result, dict): + input_t = result.get("input_token") + output_t = result.get("output_token") + if input_t is not None: + step_inv.span.set_attribute( + "gen_ai.usage.input_tokens", input_t + ) + if output_t is not None: + step_inv.span.set_attribute( + "gen_ai.usage.output_tokens", output_t + ) + + return result diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py new file mode 100644 index 000000000..1ac5bcfee --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py @@ -0,0 +1,2 @@ +_instruments = ("openai >= 1.0.0",) +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py new file mode 100644 index 000000000..c26b7711d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py @@ -0,0 +1,17 @@ +"""Utility functions for WildToolBench instrumentation.""" + +import json +from typing import Any, Optional + + +def safe_json_dumps(obj: Any, max_length: int = 10000) -> Optional[str]: + """Safely serialize object to JSON string with length limit.""" + if obj is None: + return None + try: + s = json.dumps(obj, ensure_ascii=False) + if len(s) > max_length: + return s[:max_length] + "...(truncated)" + return s + except (TypeError, ValueError): + return str(obj)[:max_length] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py new file mode 100644 index 000000000..014186185 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py @@ -0,0 +1,182 @@ +"""Test configuration for WildToolBench instrumentation tests.""" + +import json +import os + +import pytest + +os.environ.setdefault("OPENAI_API_KEY", "test_key_not_real") +os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:9999/v1") + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from 
opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + + +def pytest_configure(config: pytest.Config): + os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental" + + +@pytest.fixture(scope="function", name="span_exporter") +def fixture_span_exporter(): + exporter = InMemorySpanExporter() + yield exporter + + +@pytest.fixture(scope="function", name="tracer_provider") +def fixture_tracer_provider(span_exporter): + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function") +def instrument(tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() + + +# ==================== Minimal test data fixtures ==================== + + +def _make_chat_completion_response( + content=None, + tool_calls=None, + input_tokens=10, + output_tokens=5, + model="gpt-4o", +): + """Build a minimal ChatCompletion-like dict that can be JSON-serialized.""" + message = {"role": "assistant", "content": content or ""} + if tool_calls: + message["tool_calls"] = tool_calls + return { + "id": "chatcmpl-test", + "object": "chat.completion", + "model": model, + "choices": [{"index": 0, "message": message, "finish_reason": "stop"}], + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + }, + } + + +class FakeChatCompletion: + """Mimics openai.types.chat.ChatCompletion enough for _parse_api_response.""" + + def __init__(self, data: dict): + self._data = data + + def json(self): + return json.dumps(self._data) + + def __getattr__(self, name): + return self._data[name] + + +@pytest.fixture() +def make_completion(): + """Factory fixture to build FakeChatCompletion objects.""" + + def _factory(**kwargs): + return FakeChatCompletion(_make_chat_completion_response(**kwargs)) + + return _factory + + +@pytest.fixture() +def simple_test_entry(): + """A minimal WildToolBench test_entry with 1 task, 1 step (prepare_to_answer).""" + return { + "id": "wild_tool_bench_test_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + }, + "required": ["city"], + }, + }, + } + ], + "english_tasks": ["What is the weather in Beijing?"], + "english_answer_list": [ + [ + { + "action": { + "name": "get_weather", + "arguments": {"city": "Beijing"}, + }, + "observation": "Sunny, 25°C", + "dependency_list": [], + }, + { + "action": { + "name": "prepare_to_answer", + "arguments": {}, + }, + "observation": "The weather in Beijing is Sunny, 25°C", + "dependency_list": [0], + }, + ] + ], + } + + +@pytest.fixture() +def tool_call_response_factory(): + """Factory to make tool_call ChatCompletion responses.""" + + def _factory(tool_name, arguments, tool_call_id="call_001"): + tc = [ + { + "id": tool_call_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": ( + json.dumps(arguments) + if isinstance(arguments, dict) + else arguments + ), + }, + } + ] + return FakeChatCompletion( + _make_chat_completion_response(tool_calls=tc) + ) + + return _factory + + +@pytest.fixture() +def text_response_factory(): + """Factory to make text-only ChatCompletion responses.""" + + def 
_factory(content, input_tokens=10, output_tokens=5): + return FakeChatCompletion( + _make_chat_completion_response( + content=content, + input_tokens=input_tokens, + output_tokens=output_tokens, + ) + ) + + return _factory diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py new file mode 100644 index 000000000..2929eeb33 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py @@ -0,0 +1,108 @@ +"""Tests for AGENT span (P2: inference_multi_turn).""" + +import json +from unittest.mock import patch + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing AGENT span.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.1 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestAgentSpan: + def test_agent_span_attributes( + self, span_exporter, instrument, simple_test_entry, make_completion, + tool_call_response_factory, text_response_factory, + ): + """AGENT span should exist with correct attributes and token aggregation.""" + handler = _StubHandler() + + # Step 0: model returns tool call for get_weather + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + # Step 1: model returns text (prepare_to_answer match) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=20, output_tokens=15, + ) + handler._step_responses = [resp0, resp1] + + result = handler.inference_multi_turn(simple_test_entry) + assert result is not None + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + assert len(agent_spans) == 1 + + span = agent_spans[0] + assert span.name == "invoke_agent _StubHandler" + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "AGENT" + assert attrs.get("gen_ai.operation.name") == "invoke_agent" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.agent.name") == "_StubHandler" + assert attrs.get("gen_ai.conversation.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "test-model" + assert attrs.get("wildtool.turn_count") == 1 + + assert attrs.get("gen_ai.usage.input_tokens") == 30 + assert attrs.get("gen_ai.usage.output_tokens") == 20 + + def test_agent_parent_is_entry( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """When called via multi_threaded_inference, AGENT span should be child of ENTRY.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + 
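+        # Queue one tool-call turn and one plain-text turn; the stub pops
+        # them in order from _request_tool_call.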
handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + + entry = entry_spans[0] + agent = agent_spans[0] + assert agent.context.trace_id == entry.context.trace_id + assert agent.parent is not None + assert agent.parent.span_id == entry.context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py new file mode 100644 index 000000000..d7dd7b4aa --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py @@ -0,0 +1,283 @@ +"""Tests for CHAIN / STEP / TOOL spans (P3, P4, P5).""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass with controllable responses.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestChainSpan: + def test_chain_span_per_task( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each task should produce one CHAIN span with correct attributes.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + + chain = chain_spans[0] + assert chain.name == "workflow task_0" + attrs = dict(chain.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "CHAIN" + assert attrs.get("gen_ai.operation.name") == "workflow" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("wildtool.task_idx") == 0 + assert attrs.get("wildtool.test_entry_id") == "wild_tool_bench_test_001" + assert attrs.get("wildtool.action_name_label") == "correct" + assert attrs.get("wildtool.is_optimal") is True + + def test_chain_parent_is_agent( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """CHAIN span should be child of AGENT span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + 
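+        # Runs the full wtb loop; the instrumentation wraps it with
+        # AGENT → CHAIN → STEP spans.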
handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + chain_spans = [s for s in spans if s.name.startswith("workflow")] + + assert len(agent_spans) == 1 + assert len(chain_spans) == 1 + + agent = agent_spans[0] + chain = chain_spans[0] + assert chain.context.trace_id == agent.context.trace_id + assert chain.parent is not None + assert chain.parent.span_id == agent.context.span_id + + +class TestStepSpans: + def test_step_spans_per_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each _request_tool_call invocation should produce a STEP span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 2 + + attrs0 = dict(step_spans[0].attributes or {}) + attrs1 = dict(step_spans[1].attributes or {}) + rounds = sorted([attrs0.get("gen_ai.react.round"), attrs1.get("gen_ai.react.round")]) + assert rounds == [1, 2] + + for ss in step_spans: + a = dict(ss.attributes or {}) + assert a.get("gen_ai.span.kind") == "STEP" + assert a.get("gen_ai.operation.name") == "react" + + def test_step_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP spans should be children of CHAIN span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(chain_spans) == 1 + chain = chain_spans[0] + + for ss in step_spans: + assert ss.context.trace_id == chain.context.trace_id + assert ss.parent is not None + assert ss.parent.span_id == chain.context.span_id + + def test_step_token_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP span should have gen_ai.usage.input_tokens and output_tokens.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=25, output_tokens=12, + ) + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = sorted( + [s for s in spans if s.name == "react step"], + key=lambda s: s.attributes.get("gen_ai.react.round", 0), + ) + assert len(step_spans) == 2 + + # First step: default 10 input, 5 output from make_completion defaults + a0 = dict(step_spans[0].attributes or {}) + assert a0.get("gen_ai.usage.input_tokens") == 10 + assert a0.get("gen_ai.usage.output_tokens") == 5 + + # Second step: 25 input, 12 output + a1 = dict(step_spans[1].attributes or {}) + assert a1.get("gen_ai.usage.input_tokens") == 25 + assert 
a1.get("gen_ai.usage.output_tokens") == 12 + + +class TestToolSpans: + def test_tool_span_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL span should have correct attributes including execution_mode.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + + tool = tool_spans[0] + assert tool.name == "execute_tool get_weather" + attrs = dict(tool.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "TOOL" + assert attrs.get("gen_ai.operation.name") == "execute_tool" + assert attrs.get("gen_ai.tool.name") == "get_weather" + assert attrs.get("gen_ai.tool.type") == "function" + assert ( + attrs.get("wildtool.tool.execution_mode") == "ground_truth_replay" + ) + + def test_tool_span_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL spans share the CHAIN trace_id (parent is STEP after Round 2).""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + tool_spans = [s for s in spans if "execute_tool" in s.name] + + assert len(chain_spans) == 1 + assert len(tool_spans) >= 1 + + chain = chain_spans[0] + for ts in tool_spans: + assert ts.context.trace_id == chain.context.trace_id + + +class TestSpanHierarchy: + def test_full_hierarchy( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Verify ENTRY → AGENT → CHAIN → STEP hierarchy and consistent trace_id.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + + entry = [s for s in spans if s.name == "enter_ai_application_system"] + agent = [s for s in spans if "invoke_agent" in s.name] + chain = [s for s in spans if s.name.startswith("workflow")] + step = [s for s in spans if s.name == "react step"] + tool = [s for s in spans if "execute_tool" in s.name] + + assert len(entry) == 1 + assert len(agent) == 1 + assert len(chain) == 1 + assert len(step) == 2 + assert len(tool) >= 1 + + trace_id = entry[0].context.trace_id + for s in spans: + assert s.context.trace_id == trace_id + + # AGENT parent = ENTRY + assert agent[0].parent.span_id == entry[0].context.span_id + # CHAIN parent = AGENT + assert chain[0].parent.span_id == agent[0].context.span_id + # STEP parent = CHAIN + for s in step: + assert s.parent.span_id == chain[0].context.span_id diff --git 
a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py new file mode 100644 index 000000000..834e7dd13 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py @@ -0,0 +1,115 @@ +"""Tests for ENTRY span (P1: multi_threaded_inference). + +Module-level imports of ``wtb._llm_response_generation.multi_threaded_inference`` +must be avoided: ``wrapt.wrap_function_wrapper`` patches the attribute on the +module, but a pre-imported local binding still references the original +unwrapped function. All tests therefore import the symbol lazily after the +``instrument`` fixture has run. +""" + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing. + + Overrides ``inference`` so the multi_threaded_inference wrapper invokes a + deterministic, side-effect-free body that returns a fake result dict and + therefore exercises only the ENTRY span codepath. + """ + + def __init__(self): + super().__init__("test-model", 0.0) + + def _request_tool_call(self, inference_data): + raise NotImplementedError + + def _parse_api_response(self, api_response): + raise NotImplementedError + + def inference(self, test_entry): + return [ + { + "action_name_label": "correct", + "is_optimal": True, + "inference_log": {}, + "latency": [0.1], + "input_token_count": [10], + "output_token_count": [5], + } + ] + + +class TestEntrySpan: + def test_entry_span_created(self, span_exporter, instrument): + """ENTRY span should be created with correct attributes.""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + test_case = { + "id": "wild_tool_bench_test_001", + "english_tasks": ["task1", "task2"], + } + + result = multi_threaded_inference(handler, "gpt-4o", test_case) + + assert result is not None + assert result["id"] == "wild_tool_bench_test_001" + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + + span = entry_spans[0] + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "ENTRY" + assert attrs.get("gen_ai.operation.name") == "enter" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.session.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "gpt-4o" + assert attrs.get("wildtool.turn_count") == 2 + # ENTRY spans rely on default OTel status semantics: success leaves + # the span UNSET, failures explicitly mark it ERROR. + assert span.status.status_code != StatusCode.ERROR + + def test_entry_span_error_path(self, span_exporter, instrument): + """The ENTRY wrapper marks the span ERROR when the wrapped callable + raises an unhandled exception. + + ``multi_threaded_inference`` swallows non-rate-limit errors itself + (see test_error_scenarios.test_entry_span_captures_retry_error_path + for that path). To exercise the wrapper's failure branch directly we + invoke the underlying ``WildToolEntryWrapper`` with a callable that + deliberately raises, bypassing ``multi_threaded_inference``'s own + error handling. 
+        """
+        from opentelemetry.instrumentation.wildtool._wrappers import (
+            WildToolEntryWrapper,
+        )
+
+        wrapper = WildToolEntryWrapper(instrument._handler)
+
+        def _raising(handler, model_name, test_case):
+            raise RuntimeError("API connection failed")
+
+        handler = _StubHandler()
+        test_case = {
+            "id": "wild_tool_bench_test_002",
+            "english_tasks": ["task1"],
+        }
+
+        with pytest.raises(RuntimeError, match="API connection failed"):
+            wrapper(_raising, None, (handler, "gpt-4o", test_case), {})
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        assert len(entry_spans) == 1
+        span = entry_spans[0]
+        assert span.status.status_code == StatusCode.ERROR
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py
new file mode 100644
index 000000000..c14a3f40c
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py
@@ -0,0 +1,135 @@
+"""Tests for error/edge-case scenarios."""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """Handler with controllable step responses."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_responses = []
+        self._step_idx = 0
+
+    def _request_tool_call(self, inference_data):
+        resp = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        if isinstance(resp, Exception):
+            raise resp
+        return resp, 0.05
+
+    def _parse_api_response(self, api_response):
+        data = json.loads(api_response.json())
+        choice = data["choices"][0]
+        message = choice["message"]
+        return {
+            "reasoning_content": None,
+            "content": message.get("content"),
+            "tool_calls": message.get("tool_calls"),
+            "input_token": data["usage"]["prompt_tokens"],
+            "output_token": data["usage"]["completion_tokens"],
+        }
+
+
+class TestErrorScenarios:
+    def test_action_name_mismatch(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory,
+    ):
+        """When the model calls the wrong tool, the CHAIN span should still be OK with an error label."""
+        handler = _StubHandler()
+        # Model calls wrong_tool instead of get_weather
+        resp0 = tool_call_response_factory(
+            "wrong_tool", {"x": 1}, "call_bad"
+        )
+        handler._step_responses = [resp0]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        assert len(chain_spans) == 1
+
+        chain = chain_spans[0]
+        attrs = dict(chain.attributes or {})
+        assert attrs.get("wildtool.action_name_label") == "error"
+        assert chain.status.status_code == StatusCode.OK
+
+    def test_empty_response(
+        self, span_exporter, instrument, simple_test_entry,
+        make_completion,
+    ):
+        """When the model returns no content and no tool_calls, the process terminates gracefully."""
+        from tests.conftest import FakeChatCompletion, _make_chat_completion_response
+
+        handler = _StubHandler()
+        resp = FakeChatCompletion(
+            _make_chat_completion_response(content="", tool_calls=None)
+        )
+        handler._step_responses = [resp]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        assert len(chain_spans) == 1
+        attrs = dict(chain_spans[0].attributes or {})
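+        # An empty assistant reply matches no ground-truth action, so the
+        # chain ends with action_name_label "error" instead of raising.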
attrs.get("wildtool.action_name_label") == "error" + + def test_request_tool_call_exception_sets_error( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call should produce ERROR on STEP span and propagate.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Connection timeout")] + + with pytest.raises(RuntimeError, match="Connection timeout"): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 1 + assert step_spans[0].status.status_code == StatusCode.ERROR + + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + assert chain_spans[0].status.status_code == StatusCode.ERROR + + def test_entry_span_captures_retry_error_path( + self, span_exporter, instrument, + ): + """multi_threaded_inference catches non-rate-limit errors and returns error dict. + ENTRY span should still complete successfully (not raise).""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + + def failing_inference(test_entry): + raise ValueError("Invalid JSON from model") + + handler.inference = failing_inference + + test_case = { + "id": "wild_tool_bench_err_001", + "english_tasks": ["task1"], + } + + # multi_threaded_inference catches non-rate-limit errors + result = multi_threaded_inference(handler, "test-model", test_case) + assert "Error during inference" in result["result"] + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + # multi_threaded_inference's own try/except converts the error into a + # normal return, so the ENTRY wrapper observes a successful call and + # leaves the span at the default UNSET status (definitely not ERROR). + span = entry_spans[0] + assert span.status.status_code != StatusCode.ERROR diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py new file mode 100644 index 000000000..a8be5b4da --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py @@ -0,0 +1,20 @@ +"""Tests for WildToolInstrumentor lifecycle.""" + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor + + +class TestWildToolInstrumentor: + def test_instrument_and_uninstrument(self, tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_instrumentation_dependencies(self): + instrumentor = WildToolInstrumentor() + deps = instrumentor.instrumentation_dependencies() + assert ("openai >= 1.0.0",) == deps diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py new file mode 100644 index 000000000..9f4f4d895 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py @@ -0,0 +1,441 @@ +"""Round 2 regression tests covering the H1 / H2 / M1 / M2 / M3 fixes. 
+
+See ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)" and
+``example-deploy/validation/SUMMARY.md`` for the original validation gaps
+addressed by these tests.
+"""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """Minimal handler with controllable LLM responses (no real network)."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_responses = []
+        self._step_idx = 0
+
+    def _request_tool_call(self, inference_data):
+        resp = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        if isinstance(resp, Exception):
+            raise resp
+        return resp, 0.05
+
+    def _parse_api_response(self, api_response):
+        data = json.loads(api_response.json())
+        choice = data["choices"][0]
+        message = choice["message"]
+        return {
+            "reasoning_content": None,
+            "content": message.get("content"),
+            "tool_calls": message.get("tool_calls"),
+            "input_token": data["usage"]["prompt_tokens"],
+            "output_token": data["usage"]["completion_tokens"],
+        }
+
+
+def _spans_by_kind(spans, kind):
+    return [s for s in spans if (s.attributes or {}).get("gen_ai.span.kind") == kind]
+
+
+def _spans_named(spans, name):
+    return [s for s in spans if s.name == name]
+
+
+def _step_for_round(spans, round_num):
+    for s in _spans_named(spans, "react step"):
+        attrs = s.attributes or {}
+        if attrs.get("gen_ai.react.round") == round_num:
+            return s
+    raise AssertionError(f"no STEP span found for round={round_num}")
+
+
+# ============================================================================
+# H1: TOOL span parent_span_id == STEP span_id (was CHAIN before fix)
+# ============================================================================
+
+
+class TestToolParentIsStep:
+    def test_single_tool_parent_is_step_round_one(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """The single TOOL span in simple_test_entry should be a child of the
+        first STEP span (round=1), not the CHAIN span."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = _spans_by_kind(spans, "TOOL")
+        assert len(tool_spans) == 1, [s.name for s in spans]
+
+        tool = tool_spans[0]
+        step_round1 = _step_for_round(spans, 1)
+        chain = _spans_by_kind(spans, "CHAIN")[0]
+
+        # H1 core assertion: parent is STEP, not CHAIN.
+        assert tool.parent is not None
+        assert tool.parent.span_id == step_round1.context.span_id, (
+            "TOOL parent should be STEP round=1, got "
+            f"{tool.parent.span_id} (STEP={step_round1.context.span_id}, "
+            f"CHAIN={chain.context.span_id})"
+        )
+        assert tool.parent.span_id != chain.context.span_id
+
+        # The trace_id of course remains consistent.
+        assert tool.context.trace_id == step_round1.context.trace_id
+
+    def test_multi_step_each_tool_parented_to_correct_step(
+        self, span_exporter, instrument,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Multi-step scenario: 2 successful tool steps + 1 prepare_to_answer.
+
+        Each TOOL span must be parented to the STEP span of its own round,
+        not to the CHAIN or to a different round's STEP.
+        """
+        handler = _StubHandler()
+        # Test entry with 2 tool steps (search, lookup) then prepare_to_answer.
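+        # Expected outcome: two TOOL spans (search, lookup); the final
+        # prepare_to_answer step returns plain text, so it yields no TOOL span.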
+        test_entry = {
+            "id": "wild_tool_bench_multi_001",
+            "english_env_info": "2025-01-01",
+            "english_tools": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "search",
+                        "description": "Search items",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"q": {"type": "string"}},
+                            "required": ["q"],
+                        },
+                    },
+                },
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "lookup",
+                        "description": "Look up details",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"id": {"type": "string"}},
+                            "required": ["id"],
+                        },
+                    },
+                },
+            ],
+            "english_tasks": ["Find and summarize item X"],
+            "english_answer_list": [
+                [
+                    {
+                        "action": {"name": "search", "arguments": {"q": "X"}},
+                        "observation": "found:item_42",
+                        "dependency_list": [],
+                    },
+                    {
+                        "action": {"name": "lookup", "arguments": {"id": "item_42"}},
+                        "observation": "details:hello",
+                        "dependency_list": [0],
+                    },
+                    {
+                        "action": {"name": "prepare_to_answer", "arguments": {}},
+                        "observation": "Item X is hello.",
+                        "dependency_list": [1],
+                    },
+                ]
+            ],
+        }
+
+        resp_step1 = tool_call_response_factory(
+            "search", {"q": "X"}, "call_search_1"
+        )
+        resp_step2 = tool_call_response_factory(
+            "lookup", {"id": "item_42"}, "call_lookup_1"
+        )
+        resp_step3 = text_response_factory("Item X is hello.")
+        handler._step_responses = [resp_step1, resp_step2, resp_step3]
+
+        handler.inference_multi_turn(test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = sorted(
+            _spans_by_kind(spans, "TOOL"),
+            key=lambda s: (s.attributes or {}).get("gen_ai.tool.name") or "",
+        )
+        assert len(tool_spans) == 2, [s.name for s in spans]
+
+        step_round1 = _step_for_round(spans, 1)
+        step_round2 = _step_for_round(spans, 2)
+        chain = _spans_by_kind(spans, "CHAIN")[0]
+
+        lookup_tool = next(
+            t for t in tool_spans
+            if (t.attributes or {}).get("gen_ai.tool.name") == "lookup"
+        )
+        search_tool = next(
+            t for t in tool_spans
+            if (t.attributes or {}).get("gen_ai.tool.name") == "search"
+        )
+
+        # search → STEP round=1, lookup → STEP round=2
+        assert search_tool.parent.span_id == step_round1.context.span_id
+        assert lookup_tool.parent.span_id == step_round2.context.span_id
+        # Neither parented on CHAIN (the regression we are fixing)
+        for t in tool_spans:
+            assert t.parent.span_id != chain.context.span_id
+            assert t.context.trace_id == chain.context.trace_id
+
+
+# ============================================================================
+# M1: CHAIN span carries input.value and output.value
+# ============================================================================
+
+
+class TestChainInputOutputValue:
+    def test_chain_input_value_and_output_value(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = _spans_by_kind(spans, "CHAIN")
+        assert len(chain_spans) == 1
+        attrs = dict(chain_spans[0].attributes or {})
+
+        # input.value: last user message of the chain (prepared by wtb's
+        # _pre_messages_processing, which appends the current task as user).
+        assert "input.value" in attrs, attrs
+        assert attrs["input.value"] == "What is the weather in Beijing?"
+
+        # output.value: JSON containing action_name_label, task_idx, is_optimal.
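+        # e.g. {"action_name_label": "correct", "task_idx": 0, "is_optimal": true}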
+ assert "output.value" in attrs, attrs + out = json.loads(attrs["output.value"]) + assert out["action_name_label"] == "correct" + assert out["task_idx"] == 0 + assert out["is_optimal"] is True + + def test_chain_input_value_truncated_when_long( + self, span_exporter, instrument, + tool_call_response_factory, text_response_factory, + ): + """Very long user content should be truncated to keep span attribute small.""" + handler = _StubHandler() + long_text = "x" * 5000 + test_entry = { + "id": "wild_tool_bench_long_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "noop", + "description": "noop", + "parameters": {"type": "object", "properties": {}}, + }, + } + ], + "english_tasks": [long_text], + "english_answer_list": [ + [ + { + "action": {"name": "prepare_to_answer", "arguments": {}}, + "observation": "ok", + "dependency_list": [], + } + ] + ], + } + handler._step_responses = [text_response_factory("ok")] + + handler.inference_multi_turn(test_entry) + + spans = span_exporter.get_finished_spans() + chain = _spans_by_kind(spans, "CHAIN")[0] + attrs = dict(chain.attributes or {}) + assert "input.value" in attrs + # Default cap is 4096; truncated form must be <= cap + suffix length. + assert len(attrs["input.value"]) <= 4096 + len("...(truncated)") + assert attrs["input.value"].startswith("xxx") + + +# ============================================================================ +# M2: STEP span carries gen_ai.react.finish_reason on error paths +# ============================================================================ + + +class TestStepFinishReason: + def test_finish_reason_action_name_mismatch( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, + ): + handler = _StubHandler() + # wrong tool name → wtb's "action name not in candidate" branch + handler._step_responses = [ + tool_call_response_factory("wrong_tool", {"x": 1}, "call_bad") + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "action_name_mismatch" + + def test_finish_reason_empty_response( + self, span_exporter, instrument, simple_test_entry, make_completion, + ): + """Empty content + no tool_calls → STEP gets finish_reason=empty_response.""" + from tests.conftest import ( + FakeChatCompletion, + _make_chat_completion_response, + ) + + handler = _StubHandler() + handler._step_responses = [ + FakeChatCompletion( + _make_chat_completion_response(content="", tool_calls=None) + ) + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "empty_response" + + def test_finish_reason_request_exception( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call → STEP ERROR + finish_reason=error.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Boom")] + + with pytest.raises(RuntimeError): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert steps[0].status.status_code == StatusCode.ERROR + assert 
attrs.get("gen_ai.react.finish_reason") == "error" + + def test_finish_reason_omitted_on_success( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Successful steps should NOT have a finish_reason (per execute.md).""" + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + for s in _spans_named(spans, "react step"): + attrs = dict(s.attributes or {}) + assert "gen_ai.react.finish_reason" not in attrs, ( + f"unexpected finish_reason on success step round=" + f"{attrs.get('gen_ai.react.round')}: {attrs.get('gen_ai.react.finish_reason')}" + ) + + +# ============================================================================ +# M3: TOOL span carries gen_ai.tool.call.arguments / result / description +# (and keeps wildtool.tool.execution_mode) +# ============================================================================ + + +class TestToolSensitiveAttributes: + def test_tool_args_result_description_and_execution_mode( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("Sunny day") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = _spans_by_kind(spans, "TOOL") + assert len(tool_spans) == 1 + attrs = dict(tool_spans[0].attributes or {}) + + # M3 explicit attrs. + args_attr = attrs.get("gen_ai.tool.call.arguments") + assert args_attr is not None + assert json.loads(args_attr) == {"city": "Beijing"} + + # observation comes from the appended {"role": "tool", ...} message + # written by wtb after the call matches the answer; it's a string. + result_attr = attrs.get("gen_ai.tool.call.result") + assert result_attr == "Sunny, 25°C", attrs + + # description sourced from inference_data["tools"][i].function.description + assert attrs.get("gen_ai.tool.description") == "Get weather for a city" + + # Existing custom attribute must still be present. + assert ( + attrs.get("wildtool.tool.execution_mode") + == "ground_truth_replay" + ) + + +# ============================================================================ +# H2: STEP span carries gen_ai.system / gen_ai.provider.name fallback +# ============================================================================ + + +class TestStepProviderFallback: + def test_step_has_provider_name_fallback( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 2 + for s in steps: + attrs = dict(s.attributes or {}) + assert attrs.get("gen_ai.system") == "openai", attrs + assert attrs.get("gen_ai.provider.name") == "openai", attrs