diff --git a/autobot-backend/api/analytics_controller.py b/autobot-backend/api/analytics_controller.py index 0f121b500..b21908a2b 100644 --- a/autobot-backend/api/analytics_controller.py +++ b/autobot-backend/api/analytics_controller.py @@ -64,7 +64,7 @@ re.compile(r"^\d+$"), # Alphanumeric slugs that look generated: starts with alpha/digit, contains # both letters and digits, length ≥ 8. Avoids collapsing short word slugs. - re.compile(r"^(?=.*[a-z])(?=.*\d)[a-z0-9_-]{8,}$", re.I), + re.compile(r"^(?=[a-z0-9_-]{0,200}[a-z])(?=[a-z0-9_-]{0,200}\d)[a-z0-9_-]{8,}$", re.I), ] diff --git a/autobot-backend/api/knowledge.py b/autobot-backend/api/knowledge.py index 2f5d3c588..1e85e8853 100644 --- a/autobot-backend/api/knowledge.py +++ b/autobot-backend/api/knowledge.py @@ -634,9 +634,23 @@ def _fallback_html_strip(html_content: str) -> tuple: """ import re from html import unescape + from html.parser import HTMLParser + from io import StringIO - text = re.sub(r"<[^>]+>", " ", html_content) - text = unescape(text) + class _TagStripper(HTMLParser): + def __init__(self): + super().__init__() + self._parts: list[str] = [] + + def handle_data(self, data: str): + self._parts.append(data) + + def get_text(self) -> str: + return " ".join(self._parts) + + stripper = _TagStripper() + stripper.feed(html_content) + text = unescape(stripper.get_text()) return re.sub(r"\s+", " ", text).strip(), "" diff --git a/autobot-backend/knowledge/metadata.py b/autobot-backend/knowledge/metadata.py index 1fc274bc7..47d634471 100644 --- a/autobot-backend/knowledge/metadata.py +++ b/autobot-backend/knowledge/metadata.py @@ -32,11 +32,11 @@ FIELD_TYPES = {"string", "number", "date", "boolean", "list", "url", "email"} # Validation pattern for email -EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$") +EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+$") # Validation pattern for URL URL_PATTERN = re.compile( - r"^https?://[a-zA-Z0-9][-a-zA-Z0-9]*(\.[a-zA-Z0-9][-a-zA-Z0-9]*)+.*$" + r"^https?://[a-zA-Z0-9][-a-zA-Z0-9]*(?:\.[a-zA-Z0-9][-a-zA-Z0-9]*)+(?::\d{1,5})?(?:[/?#]\S*)?$" ) diff --git a/autobot-backend/security/domain_security.py b/autobot-backend/security/domain_security.py index 06538c867..44c79484a 100644 --- a/autobot-backend/security/domain_security.py +++ b/autobot-backend/security/domain_security.py @@ -183,7 +183,9 @@ def _compile_patterns(self): for pattern in self.config.config.get("domain_security", {}).get( "blacklist", [] ): - regex_pattern = pattern.replace("*", ".*").replace(".", "\\.") + # Split on wildcards, escape each literal part, rejoin with .* + parts = pattern.split("*") + regex_pattern = ".*".join(re.escape(p) for p in parts) self.blacklist_patterns.append( re.compile(f"^{regex_pattern}$", re.IGNORECASE) ) @@ -192,7 +194,8 @@ def _compile_patterns(self): for pattern in self.config.config.get("domain_security", {}).get( "whitelist", [] ): - regex_pattern = pattern.replace("*", ".*").replace(".", "\\.") + parts = pattern.split("*") + regex_pattern = ".*".join(re.escape(p) for p in parts) self.whitelist_patterns.append( re.compile(f"^{regex_pattern}$", re.IGNORECASE) ) diff --git a/autobot-backend/services/semantic_analyzer.py b/autobot-backend/services/semantic_analyzer.py index d8c5a02b3..793c289ba 100644 --- a/autobot-backend/services/semantic_analyzer.py +++ b/autobot-backend/services/semantic_analyzer.py @@ -51,10 +51,10 @@ class SemanticAnalyzer: "class_naming": re.compile(r"class [A-Z][a-zA-Z0-9]*:"), }, "google": { - "docstring": re.compile(r'"""[\s\S]*Args:[\s\S]*Returns:'), + "docstring": re.compile(r'"""[\s\S]{0,5000}Args:[\s\S]{0,5000}Returns:'), }, "numpy": { - "docstring": re.compile(r'"""[\s\S]*Parameters[\s\S]*----------'), + "docstring": re.compile(r'"""[\s\S]{0,5000}Parameters[\s\S]{0,5000}----------'), }, } diff --git a/autobot-backend/utils/encoding_utils.py b/autobot-backend/utils/encoding_utils.py index cf4cfea6b..703fd9405 100644 --- a/autobot-backend/utils/encoding_utils.py +++ b/autobot-backend/utils/encoding_utils.py @@ -21,8 +21,8 @@ # Issue #380: Pre-compiled ANSI escape sequence patterns for strip_ansi_codes() # These patterns are used frequently in terminal output processing _ANSI_CSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]") # CSI sequences -_ANSI_OSC_BEL_RE = re.compile(r"\x1b\][0-9;]*[^\x07]*\x07") # OSC with BEL -_ANSI_OSC_ST_RE = re.compile(r"\x1b\][0-9;]*[^\x07\x1b]*(?:\x1b\\)?") # OSC with ST +_ANSI_OSC_BEL_RE = re.compile(r"\x1b\][^\x07]{0,1024}\x07") # OSC with BEL +_ANSI_OSC_ST_RE = re.compile(r"\x1b\][^\x1b]{0,1024}(?:\x1b\\)?") # OSC with ST _ANSI_SET_MODE_RE = re.compile(r"\x1b[=>]") # Set modes _ANSI_CHARSET_RE = re.compile(r"\x1b[()][AB012]") # Character sets _ANSI_BRACKET_RE = re.compile(r"\[[\?\d;]*[hlHJ]") # Bracket sequences