|
| 1 | +""" |
| 2 | +Rule Extractor Agent: LLM-powered extraction of rule-like statements from markdown. |
| 3 | +""" |
| 4 | + |
import asyncio
import logging
import re
import time
from typing import Any

from langgraph.graph import END, START, StateGraph
from openai import APIConnectionError
from pydantic import BaseModel, Field

from src.agents.base import AgentResult, BaseAgent
from src.agents.extractor_agent.models import ExtractorOutput
from src.agents.extractor_agent.prompts import EXTRACTOR_PROMPT
| 17 | + |
| 18 | +logger = logging.getLogger(__name__) |
| 19 | + |
| 20 | +# Max length/byte cap for markdown input to reduce prompt-injection and token cost |
| 21 | +MAX_EXTRACTOR_INPUT_LENGTH = 16_000 |
| 22 | + |
| 23 | +# Patterns to redact (replaced with [REDACTED]) before sending to LLM. |
| 24 | +# (?i) in the pattern makes the match case-insensitive; do not pass re.IGNORECASE. |
| 25 | +_REDACT_PATTERNS = [ |
| 26 | + (re.compile(r"(?i)api[_-]?key\s*[:=]\s*['\"]?[\w\-]{20,}['\"]?"), "[REDACTED]"), |
| 27 | + (re.compile(r"(?i)token\s*[:=]\s*['\"]?[\w\-\.]{20,}['\"]?"), "[REDACTED]"), |
| 28 | + (re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"), "[REDACTED]"), |
| 29 | + (re.compile(r"(?i)bearer\s+[\w\-\.]+"), "Bearer [REDACTED]"), |
| 30 | +] |
| 31 | + |
| 32 | + |
| 33 | +def redact_and_cap(text: str, max_length: int = MAX_EXTRACTOR_INPUT_LENGTH) -> str: |
| 34 | + """Sanitize and cap input: redact secret/PII-like patterns and enforce max length.""" |
| 35 | + if not text or not isinstance(text, str): |
| 36 | + return "" |
| 37 | + out = text.strip() |
| 38 | + for pattern, replacement in _REDACT_PATTERNS: |
| 39 | + out = pattern.sub(replacement, out) |
| 40 | + if len(out) > max_length: |
| 41 | + out = out[:max_length].rstrip() + "\n\n[truncated]" |
| 42 | + return out |
| 43 | + |
| 44 | + |
| 45 | +class ExtractorState(BaseModel): |
| 46 | + """State for the extractor (single-node) graph.""" |
| 47 | + |
| 48 | + markdown_content: str = "" |
| 49 | + statements: list[str] = Field(default_factory=list) |
| 50 | + decision: str = "" |
| 51 | + confidence: float = 1.0 |
| 52 | + reasoning: str = "" |
| 53 | + recommendations: list[str] = Field(default_factory=list) |
| 54 | + strategy_used: str = "" |
| 55 | + |
| 56 | + |
| 57 | +class RuleExtractorAgent(BaseAgent): |
| 58 | + """ |
| 59 | + Extractor Agent: reads raw markdown and returns a structured list of rule-like statements. |
| 60 | + Single-node LangGraph: extract -> END. Uses LLM with structured output. |
| 61 | + """ |
| 62 | + |
| 63 | + def __init__(self, max_retries: int = 3, timeout: float = 30.0): |
| 64 | + super().__init__(max_retries=max_retries, agent_name="extractor_agent") |
| 65 | + self.timeout = timeout |
| 66 | + logger.info("🔧 RuleExtractorAgent initialized with max_retries=%s, timeout=%ss", max_retries, timeout) |
| 67 | + |
| 68 | + def _build_graph(self): |
| 69 | + """Single node: run LLM extraction and set state.statements.""" |
| 70 | + workflow = StateGraph(ExtractorState) |
| 71 | + |
| 72 | + async def extract_node(state: ExtractorState) -> dict: |
| 73 | + raw = (state.markdown_content or "").strip() |
| 74 | + if not raw: |
| 75 | + return { |
| 76 | + "statements": [], |
| 77 | + "decision": "none", |
| 78 | + "confidence": 0.0, |
| 79 | + "reasoning": "Empty input", |
| 80 | + "recommendations": [], |
| 81 | + "strategy_used": "", |
| 82 | + } |
| 83 | + # Centralized sanitization (see execute(): defense-in-depth with redact_and_cap at entry). |
| 84 | + content = redact_and_cap(raw) |
| 85 | + if not content: |
| 86 | + return { |
| 87 | + "statements": [], |
| 88 | + "decision": "none", |
| 89 | + "confidence": 0.0, |
| 90 | + "reasoning": "Empty after sanitization", |
| 91 | + "recommendations": [], |
| 92 | + "strategy_used": "", |
| 93 | + } |
| 94 | + prompt = EXTRACTOR_PROMPT.format(markdown_content=content) |
| 95 | + structured_llm = self.llm.with_structured_output(ExtractorOutput) |
| 96 | + result = await structured_llm.ainvoke(prompt) |
| 97 | + return { |
| 98 | + "statements": result.statements, |
| 99 | + "decision": result.decision or "extracted", |
| 100 | + "confidence": result.confidence, |
| 101 | + "reasoning": result.reasoning or "", |
| 102 | + "recommendations": result.recommendations or [], |
| 103 | + "strategy_used": result.strategy_used or "", |
| 104 | + } |
| 105 | + |
| 106 | + workflow.add_node("extract", extract_node) |
| 107 | + workflow.add_edge(START, "extract") |
| 108 | + workflow.add_edge("extract", END) |
| 109 | + return workflow.compile() |
| 110 | + |
| 111 | + async def execute(self, **kwargs: Any) -> AgentResult: |
| 112 | + """Extract rule statements from markdown. Expects markdown_content=... in kwargs.""" |
| 113 | + markdown_content = kwargs.get("markdown_content") or kwargs.get("content") or "" |
| 114 | + if not isinstance(markdown_content, str): |
| 115 | + markdown_content = str(markdown_content or "") |
| 116 | + |
| 117 | + start_time = time.time() |
| 118 | + |
| 119 | + if not markdown_content.strip(): |
| 120 | + return AgentResult( |
| 121 | + success=True, |
| 122 | + message="Empty content", |
| 123 | + data={ |
| 124 | + "statements": [], |
| 125 | + "decision": "none", |
| 126 | + "confidence": 0.0, |
| 127 | + "reasoning": "Empty content", |
| 128 | + "recommendations": [], |
| 129 | + "strategy_used": "", |
| 130 | + }, |
| 131 | + metadata={"execution_time_ms": 0}, |
| 132 | + ) |
| 133 | + |
| 134 | + try: |
| 135 | + # Defense-in-depth: redact_and_cap at entry and again in extract_node. |
| 136 | + # Keeps ExtractorState safe and ensures node always sees sanitized input. |
| 137 | + sanitized = redact_and_cap(markdown_content) |
| 138 | + logger.info("🚀 Extractor agent processing markdown (%s chars)", len(sanitized)) |
| 139 | + initial_state = ExtractorState(markdown_content=sanitized) |
| 140 | + result = await self._execute_with_timeout( |
| 141 | + self.graph.ainvoke(initial_state), |
| 142 | + timeout=self.timeout, |
| 143 | + ) |
| 144 | + execution_time = time.time() - start_time |
| 145 | + meta_base = {"execution_time_ms": execution_time * 1000} |
| 146 | + |
| 147 | + if isinstance(result, dict): |
| 148 | + statements = result.get("statements", []) |
| 149 | + decision = result.get("decision", "extracted") |
| 150 | + confidence = float(result.get("confidence", 1.0)) |
| 151 | + reasoning = result.get("reasoning", "") |
| 152 | + recommendations = result.get("recommendations", []) or [] |
| 153 | + strategy_used = result.get("strategy_used", "") |
| 154 | + elif hasattr(result, "statements"): |
| 155 | + statements = result.statements |
| 156 | + decision = getattr(result, "decision", "extracted") |
| 157 | + confidence = float(getattr(result, "confidence", 1.0)) |
| 158 | + reasoning = getattr(result, "reasoning", "") or "" |
| 159 | + recommendations = getattr(result, "recommendations", []) or [] |
| 160 | + strategy_used = getattr(result, "strategy_used", "") or "" |
| 161 | + else: |
| 162 | + statements = [] |
| 163 | + decision = "none" |
| 164 | + confidence = 0.0 |
| 165 | + reasoning = "" |
| 166 | + recommendations = [] |
| 167 | + strategy_used = "" |
| 168 | + |
| 169 | + payload = { |
| 170 | + "statements": statements, |
| 171 | + "decision": decision, |
| 172 | + "confidence": confidence, |
| 173 | + "reasoning": reasoning, |
| 174 | + "recommendations": recommendations, |
| 175 | + "strategy_used": strategy_used, |
| 176 | + } |
| 177 | + |
| 178 | + if confidence < 0.5: |
| 179 | + logger.info( |
| 180 | + "Extractor confidence below threshold (%.2f); routing to human review", |
| 181 | + confidence, |
| 182 | + ) |
| 183 | + return AgentResult( |
| 184 | + success=False, |
| 185 | + message="Low confidence; routed to human review", |
| 186 | + data=payload, |
| 187 | + metadata={**meta_base, "routing": "human_review"}, |
| 188 | + ) |
| 189 | + logger.info( |
| 190 | + "✅ Extractor agent completed in %.2fs; extracted %s statements (confidence=%.2f)", |
| 191 | + execution_time, |
| 192 | + len(statements), |
| 193 | + confidence, |
| 194 | + ) |
| 195 | + return AgentResult( |
| 196 | + success=True, |
| 197 | + message="OK", |
| 198 | + data=payload, |
| 199 | + metadata={**meta_base}, |
| 200 | + ) |
| 201 | + except TimeoutError: |
| 202 | + execution_time = time.time() - start_time |
| 203 | + logger.error("❌ Extractor agent timed out after %.2fs", execution_time) |
| 204 | + return AgentResult( |
| 205 | + success=False, |
| 206 | + message=f"Extractor timed out after {self.timeout}s", |
| 207 | + data={ |
| 208 | + "statements": [], |
| 209 | + "decision": "none", |
| 210 | + "confidence": 0.0, |
| 211 | + "reasoning": "Timeout", |
| 212 | + "recommendations": [], |
| 213 | + "strategy_used": "", |
| 214 | + }, |
| 215 | + metadata={ |
| 216 | + "execution_time_ms": execution_time * 1000, |
| 217 | + "error_type": "timeout", |
| 218 | + "routing": "human_review", |
| 219 | + }, |
| 220 | + ) |
| 221 | + except APIConnectionError as e: |
| 222 | + execution_time = time.time() - start_time |
| 223 | + logger.warning( |
| 224 | + "Extractor agent API connection failed (network/unreachable): %s", |
| 225 | + e, |
| 226 | + exc_info=False, |
| 227 | + ) |
| 228 | + return AgentResult( |
| 229 | + success=False, |
| 230 | + message="LLM API connection failed; check network and API availability.", |
| 231 | + data={ |
| 232 | + "statements": [], |
| 233 | + "decision": "none", |
| 234 | + "confidence": 0.0, |
| 235 | + "reasoning": str(e)[:500], |
| 236 | + "recommendations": [], |
| 237 | + "strategy_used": "", |
| 238 | + }, |
| 239 | + metadata={ |
| 240 | + "execution_time_ms": execution_time * 1000, |
| 241 | + "error_type": "api_connection", |
| 242 | + "routing": "human_review", |
| 243 | + }, |
| 244 | + ) |
| 245 | + except Exception as e: |
| 246 | + execution_time = time.time() - start_time |
| 247 | + logger.exception("❌ Extractor agent failed: %s", e) |
| 248 | + return AgentResult( |
| 249 | + success=False, |
| 250 | + message=str(e), |
| 251 | + data={ |
| 252 | + "statements": [], |
| 253 | + "decision": "none", |
| 254 | + "confidence": 0.0, |
| 255 | + "reasoning": str(e)[:500], |
| 256 | + "recommendations": [], |
| 257 | + "strategy_used": "", |
| 258 | + }, |
| 259 | + metadata={ |
| 260 | + "execution_time_ms": execution_time * 1000, |
| 261 | + "error_type": type(e).__name__, |
| 262 | + "routing": "human_review", |
| 263 | + }, |
| 264 | + ) |
0 commit comments