mrveiss · mrveiss · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
@@ -64,7 +64,7 @@
     re.compile(r"^\d+$"),
     # Alphanumeric slugs that look generated: starts with alpha/digit, contains
     # both letters and digits, length ≥ 8.  Avoids collapsing short word slugs.
-    re.compile(r"^(?=.*[a-z])(?=.*\d)[a-z0-9_-]{8,}$", re.I),
+    re.compile(r"^(?=[a-z0-9_-]{0,200}[a-z])(?=[a-z0-9_-]{0,200}\d)[a-z0-9_-]{8,}$", re.I),
 ]
 
 

@@ -634,9 +634,28 @@ def _fallback_html_strip(html_content: str) -> tuple:
     """
     import re
-    from html import unescape
+    from html.parser import HTMLParser
 
-    text = re.sub(r"<[^>]+>", " ", html_content)
-    text = unescape(text)
+    class _TagStripper(HTMLParser):
+        """Collect the text nodes of a document, discarding all markup."""
+
+        def __init__(self):
+            # convert_charrefs=True (the default) already decodes entities
+            # before handle_data() is called, so no later unescape() is needed.
+            super().__init__(convert_charrefs=True)
+            self._parts: list[str] = []
+
+        def handle_data(self, data: str):
+            self._parts.append(data)
+
+        def get_text(self) -> str:
+            return " ".join(self._parts)
+
+    stripper = _TagStripper()
+    stripper.feed(html_content)
+    # close() flushes text the parser is still buffering (e.g. a trailing
+    # "AT&T"); without it that text never reaches handle_data().
+    stripper.close()
+    text = stripper.get_text()
     return re.sub(r"\s+", " ", text).strip(), ""
 
 

@@ -32,11 +32,11 @@
 FIELD_TYPES = {"string", "number", "date", "boolean", "list", "url", "email"}
 
 # Validation pattern for email
-EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$")
+EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+$")
 
 # Validation pattern for URL
 URL_PATTERN = re.compile(
-    r"^https?://[a-zA-Z0-9][-a-zA-Z0-9]*(\.[a-zA-Z0-9][-a-zA-Z0-9]*)+.*$"
+    r"^https?://[a-zA-Z0-9][-a-zA-Z0-9]*(?:\.[a-zA-Z0-9][-a-zA-Z0-9]*)+(?::\d{1,5})?(?:[/?#]\S*)?$"
 )
 
 

@@ -183,7 +183,9 @@ def _compile_patterns(self):
         for pattern in self.config.config.get("domain_security", {}).get(
             "blacklist", []
         ):
-            regex_pattern = pattern.replace("*", ".*").replace(".", "\\.")
+            # Split on wildcards, escape each literal part, rejoin with .*
+            parts = pattern.split("*")
+            regex_pattern = ".*".join(re.escape(p) for p in parts)
             self.blacklist_patterns.append(
                 re.compile(f"^{regex_pattern}$", re.IGNORECASE)
             )
@@ -192,7 +194,8 @@ def _compile_patterns(self):
         for pattern in self.config.config.get("domain_security", {}).get(
             "whitelist", []
         ):
-            regex_pattern = pattern.replace("*", ".*").replace(".", "\\.")
+            parts = pattern.split("*")
+            regex_pattern = ".*".join(re.escape(p) for p in parts)
             self.whitelist_patterns.append(
                 re.compile(f"^{regex_pattern}$", re.IGNORECASE)
             )

@@ -51,10 +51,10 @@ class SemanticAnalyzer:
             "class_naming": re.compile(r"class [A-Z][a-zA-Z0-9]*:"),
         },
         "google": {
-            "docstring": re.compile(r'"""[\s\S]*Args:[\s\S]*Returns:'),
+            "docstring": re.compile(r'"""[\s\S]{0,5000}Args:[\s\S]{0,5000}Returns:'),
         },
         "numpy": {
-            "docstring": re.compile(r'"""[\s\S]*Parameters[\s\S]*----------'),
+            "docstring": re.compile(r'"""[\s\S]{0,5000}Parameters[\s\S]{0,5000}----------'),
         },
     }
 

@@ -21,8 +21,8 @@
 # Issue #380: Pre-compiled ANSI escape sequence patterns for strip_ansi_codes()
 # These patterns are used frequently in terminal output processing
 _ANSI_CSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]")  # CSI sequences
-_ANSI_OSC_BEL_RE = re.compile(r"\x1b\][0-9;]*[^\x07]*\x07")  # OSC with BEL
-_ANSI_OSC_ST_RE = re.compile(r"\x1b\][0-9;]*[^\x07\x1b]*(?:\x1b\\)?")  # OSC with ST
+_ANSI_OSC_BEL_RE = re.compile(r"\x1b\][^\x07]{0,1024}\x07")  # OSC with BEL
+_ANSI_OSC_ST_RE = re.compile(r"\x1b\][^\x1b]{0,1024}(?:\x1b\\)?")  # OSC with ST
 _ANSI_SET_MODE_RE = re.compile(r"\x1b[=>]")  # Set modes
 _ANSI_CHARSET_RE = re.compile(r"\x1b[()][AB012]")  # Character sets
 _ANSI_BRACKET_RE = re.compile(r"\[[\?\d;]*[hlHJ]")  # Bracket sequences