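Move message normalization/truncation into ai/span_config.py; extract Cohere messages and response text in the integration modules instead of configs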

shellmayr · shellmayr · commit 3cbaad63d715 · 2026-03-05T12:56:17.000+01:00
diff --git a/sentry_sdk/ai/span_config.py b/sentry_sdk/ai/span_config.py
@@ -33,27 +33,31 @@ def set_request_span_data(span, kwargs, integration, config, span_data=None):
             set_data_normalized(span, span_attr, value)
 
     if should_send_default_pii() and integration.include_prompts:
-        extract = config.get("extract_messages")
-        if extract is not None:
-            messages = extract(kwargs)
-            if messages:
-                messages = normalize_message_roles(messages)
-                scope = sentry_sdk.get_current_scope()
-                messages = truncate_and_annotate_messages(messages, span, scope)
-                if messages is not None:
-                    target = config.get(
-                        "message_target", SPANDATA.GEN_AI_REQUEST_MESSAGES
-                    )
-                    set_data_normalized(span, target, messages, unpack=False)
-
         for kwarg_key, span_attr in config.get("pii_params", {}).items():
             if kwarg_key in kwargs:
                 value = kwargs[kwarg_key]
                 set_data_normalized(span, span_attr, value)
 
 
+def set_request_messages(span, messages, target=None):
+    # type: (Span, Any, Optional[str]) -> None
+    """Normalize, truncate, and set request messages on the span.
+
+    Caller is responsible for PII gating.
+    """
+    if not messages:
+        return
+    messages = normalize_message_roles(messages)
+    scope = sentry_sdk.get_current_scope()
+    messages = truncate_and_annotate_messages(messages, span, scope)
+    if messages is not None:
+        set_data_normalized(
+            span, target or SPANDATA.GEN_AI_REQUEST_MESSAGES, messages, unpack=False
+        )
+
+
 def set_response_span_data(
-    span, response, include_pii, response_config, collected_text=None
+    span, response, include_pii, response_config, response_text=None
 ):
     # type: (Span, Any, bool, Dict[str, Any], Optional[List[str]]) -> None
     """Set response span data from a declarative config."""
@@ -65,16 +69,8 @@ def set_response_span_data(
         pii_sources = response_config.get("pii_sources")
         if pii_sources:
             set_span_data_from_sources(span, response, pii_sources, require_truthy=True)
-        if collected_text:
-            set_data_normalized(
-                span, SPANDATA.GEN_AI_RESPONSE_TEXT, ["".join(collected_text)]
-            )
-        else:
-            extract_text = response_config.get("extract_text")
-            if extract_text:
-                texts = extract_text(response)
-                if texts:
-                    set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, texts)
+        if response_text:
+            set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_text)
 
     usage_config = response_config.get("usage")
     if usage_config:
diff --git a/sentry_sdk/integrations/cohere/__init__.py b/sentry_sdk/integrations/cohere/__init__.py
@@ -1,12 +1,17 @@
 import sys
 from functools import wraps
 
-from sentry_sdk.consts import OP
-from sentry_sdk.ai.span_config import set_request_span_data, set_response_span_data
+from sentry_sdk.consts import OP, SPANDATA
+from sentry_sdk.ai.span_config import (
+    set_request_span_data,
+    set_request_messages,
+    set_response_span_data,
+)
 from sentry_sdk.integrations.cohere.configs import COHERE_EMBED_CONFIG
 
 from typing import TYPE_CHECKING
 
+from sentry_sdk.scope import should_send_default_pii
 from sentry_sdk.tracing_utils import set_span_errored
 
 if TYPE_CHECKING:
@@ -64,24 +69,41 @@ def new_embed(*args, **kwargs):
 
         model = kwargs.get("model", "")
 
+        include_pii = should_send_default_pii() and integration.include_prompts
+
         with sentry_sdk.start_span(
             op=OP.GEN_AI_EMBEDDINGS,
             name=f"embeddings {model}".strip(),
             origin=CohereIntegration.origin,
         ) as span:
             set_request_span_data(span, kwargs, integration, COHERE_EMBED_CONFIG)
+            if include_pii and "texts" in kwargs:
+                set_request_messages(
+                    span,
+                    _normalize_embedding_input(kwargs["texts"]),
+                    target=SPANDATA.GEN_AI_EMBEDDINGS_INPUT,
+                )
 
             try:
-                res = f(*args, **kwargs)
+                response = f(*args, **kwargs)
             except Exception as e:
                 exc_info = sys.exc_info()
                 with capture_internal_exceptions():
                     _capture_exception(e)
                 reraise(*exc_info)
 
             set_response_span_data(
-                span, res, False, COHERE_EMBED_CONFIG["response"]
+                span, response, False, COHERE_EMBED_CONFIG["response"]
             )
-            return res
+            return response
 
     return new_embed
+
+
+def _normalize_embedding_input(texts):
+    # type: (Any) -> Any
+    if isinstance(texts, list):
+        return texts
+    if isinstance(texts, tuple):
+        return list(texts)
+    return [texts]
diff --git a/sentry_sdk/integrations/cohere/configs.py b/sentry_sdk/integrations/cohere/configs.py
@@ -1,13 +1,9 @@
-from sentry_sdk.ai.utils import (
-    get_first_from_sources,
-    transform_message_content,
-)
 from sentry_sdk.consts import SPANDATA
 
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
+    from typing import Any, Dict, Sequence, Tuple
     from typing_extensions import TypedDict
 
     # Source paths: list of attribute chains to try in order.
@@ -32,9 +28,6 @@ class ResponseConfig(TypedDict, total=False):
         sources: SourceMapping
         # Attributes extracted only when PII sending is enabled.
         pii_sources: SourceMapping
-        # Custom extractor for response text (PII only).
-        # Returns list of text strings, or None.
-        extract_text: Callable[[Any], Optional[List[str]]]
         # Declarative token usage paths.
         usage: UsageConfig
 
@@ -47,10 +40,6 @@ class OperationConfig(TypedDict, total=False):
         params: Dict[str, str]
         # Maps kwarg names to SPANDATA keys (only set when PII is enabled).
         pii_params: Dict[str, str]
-        # Extracts messages from kwargs for the span.
-        extract_messages: Callable[[Dict[str, Any]], Optional[List[Dict[str, Any]]]]
-        # SPANDATA key for messages (default: GEN_AI_REQUEST_MESSAGES).
-        message_target: str
         # Non-streaming response config.
         response: ResponseConfig
         # Streaming response config (different attribute paths).
@@ -60,62 +49,6 @@ class OperationConfig(TypedDict, total=False):
         stream_response_object: SourcePaths
 
 
-# ── Helpers ──────────────────────────────────────────────────────────────────
-
-
-def _normalize_embedding_input(texts):
-    # type: (Any) -> Any
-    if isinstance(texts, list):
-        return texts
-    if isinstance(texts, tuple):
-        return list(texts)
-    return [texts]
-
-
-def _extract_v1_messages(kwargs):
-    # type: (dict[str, Any]) -> list[dict[str, str]]
-    messages = []
-    for x in kwargs.get("chat_history", []):
-        messages.append(
-            {
-                "role": getattr(x, "role", ""),
-                "content": transform_message_content(getattr(x, "message", "")),
-            }
-        )
-    message = kwargs.get("message")
-    if message:
-        messages.append({"role": "user", "content": transform_message_content(message)})
-    return messages
-
-
-def _extract_v1_response_text(response):
-    # type: (Any) -> list[str] | None
-    text = getattr(response, "text", None)
-    return [text] if text is not None else None
-
-
-def _extract_v2_messages(messages):
-    # type: (Any) -> list[dict[str, Any]]
-    result = []
-    for msg in messages:
-        role = msg["role"] if isinstance(msg, dict) else getattr(msg, "role", "unknown")
-        content = (
-            msg["content"] if isinstance(msg, dict) else getattr(msg, "content", "")
-        )
-        result.append({"role": role, "content": transform_message_content(content)})
-    return result
-
-
-def _extract_v2_response_text(response):
-    # type: (Any) -> list[str] | None
-    content = get_first_from_sources(response, [("message", "content")], True)
-    if content:
-        texts = [item.text for item in content if hasattr(item, "text")]
-        if texts:
-            return texts
-    return None
-
-
 # ── Configs ──────────────────────────────────────────────────────────────────
 
 
@@ -125,10 +58,6 @@ def _extract_v2_response_text(response):
         SPANDATA.GEN_AI_OPERATION_NAME: "embeddings",
     },
     "params": {"model": SPANDATA.GEN_AI_REQUEST_MODEL},
-    "extract_messages": lambda kw: (
-        _normalize_embedding_input(kw["texts"]) if "texts" in kw else None
-    ),
-    "message_target": SPANDATA.GEN_AI_EMBEDDINGS_INPUT,
     "response": {
         "usage": {
             "input_tokens": [("meta", "billed_units", "input_tokens")],
@@ -143,7 +72,6 @@ def _extract_v2_response_text(response):
         SPANDATA.GEN_AI_SYSTEM: "cohere",
         SPANDATA.GEN_AI_OPERATION_NAME: "chat",
     },
-    "extract_messages": lambda kw: _extract_v1_messages(kw),
     "response": {
         "sources": {
             SPANDATA.GEN_AI_RESPONSE_MODEL: [("model",)],
@@ -153,7 +81,6 @@ def _extract_v2_response_text(response):
         "pii_sources": {
             SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS: [("tool_calls",)],
         },
-        "extract_text": _extract_v1_response_text,
         "usage": {
             "input_tokens": [
                 ("meta", "billed_units", "input_tokens"),
@@ -180,7 +107,6 @@ def _extract_v2_response_text(response):
     "pii_params": {
         "tools": SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS,
     },
-    "extract_messages": lambda kw: _extract_v2_messages(kw.get("messages", [])),
     "response": {
         "sources": {
             SPANDATA.GEN_AI_RESPONSE_MODEL: [("model",)],
@@ -190,7 +116,6 @@ def _extract_v2_response_text(response):
         "pii_sources": {
             SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS: [("message", "tool_calls")],
         },
-        "extract_text": _extract_v2_response_text,
         "usage": {
             "input_tokens": [
                 ("usage", "billed_units", "input_tokens"),
diff --git a/sentry_sdk/integrations/cohere/v1.py b/sentry_sdk/integrations/cohere/v1.py
@@ -1,8 +1,12 @@
 import sys
 from functools import wraps
 
-from sentry_sdk.ai.span_config import set_request_span_data, set_response_span_data
-from sentry_sdk.ai.utils import get_first_from_sources
+from sentry_sdk.ai.span_config import (
+    set_request_span_data,
+    set_request_messages,
+    set_response_span_data,
+)
+from sentry_sdk.ai.utils import get_first_from_sources, transform_message_content
 from sentry_sdk.consts import OP, SPANDATA
 from sentry_sdk.integrations.cohere.configs import COHERE_V1_CHAT_CONFIG
 
@@ -88,11 +92,20 @@ def new_chat(*args, **kwargs):
                 set_request_span_data(
                     span, kwargs, integration, COHERE_V1_CHAT_CONFIG, span_data
                 )
+                if include_pii:
+                    set_request_messages(span, _extract_v1_messages(kwargs))
 
                 if streaming:
                     return _iter_stream_events(response, span, include_pii)
+                response_text = (
+                    _extract_v1_response_text(response) if include_pii else None
+                )
                 set_response_span_data(
-                    span, response, include_pii, COHERE_V1_CHAT_CONFIG["response"]
+                    span,
+                    response,
+                    include_pii,
+                    COHERE_V1_CHAT_CONFIG["response"],
+                    response_text,
                 )
                 return response
 
@@ -110,7 +123,36 @@ def _iter_stream_events(old_iterator, span, include_pii):
                     x, COHERE_V1_CHAT_CONFIG["stream_response_object"]
                 )
                 if response is not None:
+                    response_text = (
+                        _extract_v1_response_text(response) if include_pii else None
+                    )
                     set_response_span_data(
-                        span, response, include_pii, COHERE_V1_CHAT_CONFIG["response"]
+                        span,
+                        response,
+                        include_pii,
+                        COHERE_V1_CHAT_CONFIG["response"],
+                        response_text,
                     )
         yield x
+
+
+def _extract_v1_messages(kwargs):
+    # type: (Any) -> list[dict[str, str]]
+    messages = []
+    for x in kwargs.get("chat_history", []):
+        messages.append(
+            {
+                "role": getattr(x, "role", ""),
+                "content": transform_message_content(getattr(x, "message", "")),
+            }
+        )
+    message = kwargs.get("message")
+    if message:
+        messages.append({"role": "user", "content": transform_message_content(message)})
+    return messages
+
+
+def _extract_v1_response_text(response):
+    # type: (Any) -> list[str] | None
+    text = getattr(response, "text", None)
+    return [text] if text is not None else None
diff --git a/sentry_sdk/integrations/cohere/v2.py b/sentry_sdk/integrations/cohere/v2.py