From 5d417bb8cc9edea11e65298900ffa7308560573c Mon Sep 17 00:00:00 2001
From: maskedsyntax <aftaab@aftaab.xyz>
Date: Thu, 19 Feb 2026 21:46:44 +0530
Subject: [PATCH 1/2] refactor: deduplicate report renderers, optimize text
 summarization, update README

- Extract BaseReport class with shared ALERT_TYPE_MAPPING and
  _group_alerts_by_type() (removed ~100 lines of copy-paste across 3 files)
- Optimize _summarize_text() with single-pass Counter-based analysis
  (eliminates double unicodedata.category() calls and pd.Series copies)
- Extract duplicated leakage quick_fix strings to module constants
- Update README: stable release note, CI badge, complete checks list (25 checks)
---
 README.md                       | 27 +++++++++++----
 hashprep/checks/leakage.py      | 28 ++++++++--------
 hashprep/reports/base.py        | 42 +++++++++++++++++++++++
 hashprep/reports/html.py        | 43 ++----------------------
 hashprep/reports/markdown.py    | 42 ++---------------------
 hashprep/reports/pdf.py         | 40 ++--------------------
 hashprep/summaries/variables.py | 59 +++++++++++++++++----------------
 7 files changed, 112 insertions(+), 169 deletions(-)
 create mode 100644 hashprep/reports/base.py
diff --git a/README.md b/README.md
index 0e26d77..694026a 100644
--- a/README.md
+++ b/README.md
@@ -11,10 +11,11 @@
   <p align="center">
     <!-- Distribution -->
     <img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" />
-    <!-- <img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" /> -->
     <!-- License -->
     <img src="https://img.shields.io/badge/License-MIT-green" />
     <img src="https://img.shields.io/badge/CLI-Supported-orange" />
+    <!-- CI -->
+    <a href="https://github.com/cachevector/hashprep/actions/workflows/ci.yml"><img src="https://github.com/cachevector/hashprep/actions/workflows/ci.yml/badge.svg" /></a>
   </p>
   <p>
     <!-- Features -->
@@ -26,7 +27,7 @@
 </div>
 
 > [!NOTE]
-> HashPrep is now in **beta** (v0.1.0b1). Core features are stable and tested, but the API may still evolve based on community feedback. Ready for testing in real-world ML workflows.
+> HashPrep v0.1.0 is now stable. Core features are fully tested with CI. The API may still evolve based on community feedback.
 
 ## Overview
 
@@ -165,19 +166,31 @@ hashprep version
 ```
 
 #### Available Checks
-- `outliers` - Detect outliers using IQR method
+- `outliers` - Detect outliers using z-score
 - `duplicates` - Find duplicate rows
-- `high_missing_values` - Columns with >50% missing data
+- `high_missing_values` - Columns with high missing data
+- `empty_columns` - Completely empty columns
 - `dataset_missingness` - Overall missing data patterns
+- `missing_patterns` - Correlated missing value patterns
 - `high_cardinality` - Categorical columns with too many unique values
 - `single_value_columns` - Constant columns with no variance
+- `mixed_data_types` - Columns with mixed data types
 - `class_imbalance` - Imbalanced target variable (requires --target)
-- `feature_correlation` - Highly correlated features
-- `target_leakage` - Features that may leak target information
+- `feature_correlation` - Highly correlated numeric features
+- `categorical_correlation` - Highly associated categorical features
+- `mixed_correlation` - Numeric-categorical associations
+- `data_leakage` - Columns identical to target
+- `target_leakage_patterns` - Features that may leak target information
 - `dataset_drift` - Distribution drift between datasets (requires --comparison)
 - `uniform_distribution` - Uniformly distributed numeric columns
 - `unique_values` - Columns where >95% values are unique
-- `many_zeros` - Columns with excessive zero values
+- `high_zero_counts` - Columns with excessive zero values
+- `skewness` - Highly skewed numeric distributions
+- `infinite_values` - Columns containing infinite values
+- `constant_length` - String columns with constant character length
+- `extreme_text_lengths` - Text columns with extreme value lengths
+- `datetime_skew` - Datetime columns concentrated in one period
+- `empty_dataset` - Empty or all-missing datasets
 
 ---
 
diff --git a/hashprep/checks/leakage.py b/hashprep/checks/leakage.py
index cdfa429..c98e650 100644
--- a/hashprep/checks/leakage.py
+++ b/hashprep/checks/leakage.py
@@ -9,6 +9,16 @@
 _LEAK = DEFAULT_CONFIG.leakage
 _log = get_logger("checks.leakage")
 
+_LEAKAGE_CRITICAL_FIX = (
+    "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info)."
+    "\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
+)
+_LEAKAGE_WARNING_FIX = (
+    "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info)."
+    "\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting)."
+    "\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
+)
+
 
 def _check_data_leakage(analyzer):
     issues = []
@@ -52,11 +62,7 @@ def _check_target_leakage_patterns(analyzer):
                     )
                     if severity:
                         impact = "high" if severity == "critical" else "medium"
-                        quick_fix = (
-                            "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
-                            if severity == "critical"
-                            else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
-                        )
+                        quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
                         issues.append(
                             Issue(
                                 category="target_leakage",
@@ -87,11 +93,7 @@ def _check_target_leakage_patterns(analyzer):
                     )
                     if severity:
                         impact = "high" if severity == "critical" else "medium"
-                        quick_fix = (
-                            "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
-                            if severity == "critical"
-                            else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
-                        )
+                        quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
                         issues.append(
                             Issue(
                                 category="target_leakage",
@@ -127,11 +129,7 @@ def _check_target_leakage_patterns(analyzer):
                     )
                     if severity:
                         impact = "high" if severity == "critical" else "medium"
-                        quick_fix = (
-                            "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
-                            if severity == "critical"
-                            else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
-                        )
+                        quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
                         issues.append(
                             Issue(
                                 category="target_leakage",
diff --git a/hashprep/reports/base.py b/hashprep/reports/base.py
new file mode 100644
index 0000000..a497dbc
--- /dev/null
+++ b/hashprep/reports/base.py
@@ -0,0 +1,56 @@
+"""Base report class with shared logic for all report renderers."""
+
+
+class BaseReport:
+    """Shared alert-grouping behaviour inherited by the report renderers."""
+
+    # Maps an issue's "category" to the heading it is grouped under in the
+    # alerts section; several categories deliberately share one heading.
+    ALERT_TYPE_MAPPING = {
+        "feature_correlation": "High Correlation",
+        "categorical_correlation": "High Correlation",
+        "mixed_correlation": "High Correlation",
+        "missing_values": "Missing",
+        "high_missing_values": "Missing",
+        "dataset_missingness": "Missing",
+        "missing_patterns": "Missing",
+        "uniform_distribution": "Uniform",
+        "unique_values": "Unique",
+        "high_zero_counts": "Zeros",
+        "outliers": "Outliers",
+        "skewness": "Skewness",
+        "high_cardinality": "High Cardinality",
+        "duplicates": "Duplicates",
+        "data_leakage": "Leakage",
+        # The target-leakage check emits category="target_leakage"; without
+        # this key those issues would be grouped under "Other".
+        "target_leakage": "Leakage",
+        "target_leakage_patterns": "Leakage",
+        "class_imbalance": "Imbalance",
+        "empty_columns": "Empty",
+        "single_value_columns": "Constant",
+        "mixed_data_types": "Mixed Types",
+        "extreme_text_lengths": "Text Length",
+        "datetime_skew": "DateTime Skew",
+        "dataset_drift": "Drift",
+        "infinite_values": "Infinite",
+        "constant_length": "Constant Length",
+        "empty_dataset": "Empty Dataset",
+    }
+
+    def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
+        """Group issues into display categories for the alerts section.
+
+        Args:
+            issues: Issue dicts; each must have a "category" key.
+
+        Returns:
+            Mapping of display heading to the issues under it, in the order
+            each heading and issue was first encountered. Categories missing
+            from ALERT_TYPE_MAPPING are collected under "Other".
+        """
+        groups: dict[str, list[dict]] = {}
+        for issue in issues:
+            alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
+            groups.setdefault(alert_type, []).append(issue)
+        return groups
diff --git a/hashprep/reports/html.py b/hashprep/reports/html.py
index a86d27f..e45f876 100644
--- a/hashprep/reports/html.py
+++ b/hashprep/reports/html.py
@@ -7,37 +7,10 @@
 
 import hashprep
 
+from .base import BaseReport
 
-class HtmlReport:
-    ALERT_TYPE_MAPPING = {
-        "feature_correlation": "High Correlation",
-        "categorical_correlation": "High Correlation",
-        "mixed_correlation": "High Correlation",
-        "missing_values": "Missing",
-        "high_missing_values": "Missing",
-        "dataset_missingness": "Missing",
-        "missing_patterns": "Missing",
-        "uniform_distribution": "Uniform",
-        "unique_values": "Unique",
-        "high_zero_counts": "Zeros",
-        "outliers": "Outliers",
-        "skewness": "Skewness",
-        "high_cardinality": "High Cardinality",
-        "duplicates": "Duplicates",
-        "data_leakage": "Leakage",
-        "target_leakage_patterns": "Leakage",
-        "class_imbalance": "Imbalance",
-        "empty_columns": "Empty",
-        "single_value_columns": "Constant",
-        "mixed_data_types": "Mixed Types",
-        "extreme_text_lengths": "Text Length",
-        "datetime_skew": "DateTime Skew",
-        "dataset_drift": "Drift",
-        "infinite_values": "Infinite",
-        "constant_length": "Constant Length",
-        "empty_dataset": "Empty Dataset",
-    }
 
+class HtmlReport(BaseReport):
     def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_mode=False):
         template_str = self._get_template(theme)
         template = Template(template_str)
@@ -101,18 +74,6 @@ def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_m
                 f.write(html_content)
         return html_content
 
-    def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
-        """Group issues into display categories for the alerts section."""
-        groups: dict[str, list[dict]] = {}
-
-        for issue in issues:
-            alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
-            if alert_type not in groups:
-                groups[alert_type] = []
-            groups[alert_type].append(issue)
-
-        return groups
-
     def _generate_config(self, summary) -> dict:
         """Generate configuration dict for download."""
         reproduction_info = summary["summaries"].get("reproduction_info", {})
diff --git a/hashprep/reports/markdown.py b/hashprep/reports/markdown.py
index 39282fc..1af75c4 100644
--- a/hashprep/reports/markdown.py
+++ b/hashprep/reports/markdown.py
@@ -6,40 +6,12 @@
 import hashprep
 
 from ..utils.logging import get_logger
+from .base import BaseReport
 
 _log = get_logger("reports.markdown")
 
 
-class MarkdownReport:
-    ALERT_TYPE_MAPPING = {
-        "feature_correlation": "High Correlation",
-        "categorical_correlation": "High Correlation",
-        "mixed_correlation": "High Correlation",
-        "missing_values": "Missing",
-        "high_missing_values": "Missing",
-        "dataset_missingness": "Missing",
-        "missing_patterns": "Missing",
-        "uniform_distribution": "Uniform",
-        "unique_values": "Unique",
-        "high_zero_counts": "Zeros",
-        "outliers": "Outliers",
-        "skewness": "Skewness",
-        "high_cardinality": "High Cardinality",
-        "duplicates": "Duplicates",
-        "data_leakage": "Leakage",
-        "target_leakage_patterns": "Leakage",
-        "class_imbalance": "Imbalance",
-        "empty_columns": "Empty",
-        "single_value_columns": "Constant",
-        "mixed_data_types": "Mixed Types",
-        "extreme_text_lengths": "Text Length",
-        "datetime_skew": "DateTime Skew",
-        "dataset_drift": "Drift",
-        "infinite_values": "Infinite",
-        "constant_length": "Constant Length",
-        "empty_dataset": "Empty Dataset",
-    }
-
+class MarkdownReport(BaseReport):
     def generate(self, summary, full=False, output_file=None):
         dataset_info = summary["summaries"]["dataset_info"]
         reproduction_info = summary["summaries"].get("reproduction_info", {})
@@ -275,13 +247,3 @@ def generate(self, summary, full=False, output_file=None):
             with open(output_file, "w") as f:
                 f.write(content)
         return content
-
-    def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
-        """Group issues into display categories."""
-        groups: dict[str, list[dict]] = {}
-        for issue in issues:
-            alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
-            if alert_type not in groups:
-                groups[alert_type] = []
-            groups[alert_type].append(issue)
-        return groups
diff --git a/hashprep/reports/pdf.py b/hashprep/reports/pdf.py
index db8dccf..609dfe6 100644
--- a/hashprep/reports/pdf.py
+++ b/hashprep/reports/pdf.py
@@ -6,37 +6,10 @@
 
 import hashprep
 
+from .base import BaseReport
 
-class PdfReport:
-    ALERT_TYPE_MAPPING = {
-        "feature_correlation": "High Correlation",
-        "categorical_correlation": "High Correlation",
-        "mixed_correlation": "High Correlation",
-        "missing_values": "Missing",
-        "high_missing_values": "Missing",
-        "dataset_missingness": "Missing",
-        "missing_patterns": "Missing",
-        "uniform_distribution": "Uniform",
-        "unique_values": "Unique",
-        "high_zero_counts": "Zeros",
-        "outliers": "Outliers",
-        "skewness": "Skewness",
-        "high_cardinality": "High Cardinality",
-        "duplicates": "Duplicates",
-        "data_leakage": "Leakage",
-        "target_leakage_patterns": "Leakage",
-        "class_imbalance": "Imbalance",
-        "empty_columns": "Empty",
-        "single_value_columns": "Constant",
-        "mixed_data_types": "Mixed Types",
-        "extreme_text_lengths": "Text Length",
-        "datetime_skew": "DateTime Skew",
-        "dataset_drift": "Drift",
-        "infinite_values": "Infinite",
-        "constant_length": "Constant Length",
-        "empty_dataset": "Empty Dataset",
-    }
 
+class PdfReport(BaseReport):
     def generate(self, summary, full=False, output_file=None, **kwargs):
         template = Template(self._get_template())
 
@@ -87,15 +60,6 @@ def generate(self, summary, full=False, output_file=None, **kwargs):
                 f.write(pdf_content)
         return pdf_content
 
-    def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
-        groups: dict[str, list[dict]] = {}
-        for issue in issues:
-            alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
-            if alert_type not in groups:
-                groups[alert_type] = []
-            groups[alert_type].append(issue)
-        return groups
-
     def _get_template(self):
         return """<!DOCTYPE html>
 <html>
diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py
index ba019e0..8346a74 100644
--- a/hashprep/summaries/variables.py
+++ b/hashprep/summaries/variables.py
@@ -1,6 +1,6 @@
 import re
 import unicodedata
-from collections import defaultdict
+from collections import Counter, defaultdict
 
 import numpy as np
 import pandas as pd
@@ -181,52 +181,55 @@ def _summarize_text(df, col):
     lengths = series.str.len()
     all_text = "".join(series)
     total_chars = len(all_text)
-    distinct_chars = len(set(all_text))
-    all_categories = [unicodedata.category(c) for c in all_text]
-    cat_series = pd.Series(all_categories)
-    distinct_categories = int(cat_series.nunique())
-    most_occurring_categories = cat_series.value_counts().head(10).to_dict()
-    cat_to_char_count = defaultdict(lambda: defaultdict(int))
+
+    # Single pass: collect char counts, category counts, and per-category char counts
+    char_counts = Counter()
+    cat_counts = Counter()
+    cat_to_char_count = defaultdict(Counter)
     for c in all_text:
+        char_counts[c] += 1
         cat = unicodedata.category(c)
+        cat_counts[cat] += 1
         cat_to_char_count[cat][c] += 1
+
+    distinct_chars = len(char_counts)
+    distinct_categories = len(cat_counts)
+
     most_freq_per_cat = {}
-    for cat, char_count in cat_to_char_count.items():
-        if char_count:
-            top_char = max(char_count, key=char_count.get)
-            count = char_count[top_char]
-            freq = (count / total_chars * 100) if total_chars > 0 else 0
-            most_freq_per_cat[cat] = {
-                "char": top_char,
-                "count": count,
-                "percentage": float(freq),
-            }
-    distinct_scripts = None
-    most_occurring_scripts = None
+    for cat, char_counter in cat_to_char_count.items():
+        top_char, count = char_counter.most_common(1)[0]
+        freq = (count / total_chars * 100) if total_chars > 0 else 0
+        most_freq_per_cat[cat] = {
+            "char": top_char,
+            "count": count,
+            "percentage": float(freq),
+        }
+
+    # Word analysis
     words = re.findall(r"\b\w+\b", all_text.lower())
     word_len = len(words)
-    word_vc = pd.Series(words).value_counts().head(10)
     words_dict = {
         w: {
-            "count": int(c),
+            "count": c,
             "frequency": float(c / word_len * 100) if word_len > 0 else 0.0,
         }
-        for w, c in word_vc.items()
+        for w, c in Counter(words).most_common(10)
     }
-    char_vc = pd.Series(list(all_text)).value_counts().head(10)
+
+    # Top characters and categories
     char_dict = {
         str(c): {
-            "count": int(v),
+            "count": v,
             "percentage": float(v / total_chars * 100) if total_chars > 0 else 0.0,
         }
-        for c, v in char_vc.items()
+        for c, v in char_counts.most_common(10)
     }
     cat_dict = {
         k: {
             "count": v,
             "percentage": float(v / total_chars * 100) if total_chars > 0 else 0.0,
         }
-        for k, v in most_occurring_categories.items()
+        for k, v in cat_counts.most_common(10)
     }
     sample = [str(s) for s in series.head(5).tolist()]
     stats = {
@@ -241,7 +244,7 @@ def _summarize_text(df, col):
                 "total_characters": total_chars,
                 "distinct_characters": distinct_chars,
                 "distinct_categories": distinct_categories,
-                "distinct_scripts": distinct_scripts,
+                "distinct_scripts": None,
                 "distinct_blocks": None,
             },
             "sample": sample,
@@ -254,7 +257,7 @@ def _summarize_text(df, col):
                 "most_frequent_character_per_category": most_freq_per_cat,
             },
             "scripts": {
-                "most_occurring_scripts": most_occurring_scripts,
+                "most_occurring_scripts": None,
                 "most_frequent_character_per_script": None,
             },
             "blocks": {

From f1ef5483238dd91b02a0e5f7a2548d7bb1a97a86 Mon Sep 17 00:00:00 2001
From: maskedsyntax <aftaab@aftaab.xyz>
Date: Thu, 19 Feb 2026 21:47:03 +0530
Subject: [PATCH 2/2] chore: bump version to 0.1.0b2 (beta release)

---
 README.md            | 2 +-
 hashprep/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 694026a..95ba85e 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
 </div>
 
 > [!NOTE]
-> HashPrep v0.1.0 is now stable. Core features are fully tested with CI. The API may still evolve based on community feedback.
+> HashPrep is in **beta** (v0.1.0b2). Core features are fully tested with CI. The API may still evolve based on community feedback.
 
 ## Overview
 
diff --git a/hashprep/__init__.py b/hashprep/__init__.py
index 231a028..6534f0d 100644
--- a/hashprep/__init__.py
+++ b/hashprep/__init__.py
@@ -1,3 +1,3 @@
 from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer
 
-__version__ = "0.1.0b1"
+__version__ = "0.1.0b2"