Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
<p align="center">
<!-- Distribution -->
<img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" />
<!-- <img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" /> -->
<!-- License -->
<img src="https://img.shields.io/badge/License-MIT-green" />
<img src="https://img.shields.io/badge/CLI-Supported-orange" />
<!-- CI -->
<a href="https://github.com/cachevector/hashprep/actions/workflows/ci.yml"><img src="https://github.com/cachevector/hashprep/actions/workflows/ci.yml/badge.svg" /></a>
</p>
<p>
<!-- Features -->
Expand All @@ -26,7 +27,7 @@
</div>

> [!NOTE]
> HashPrep is now in **beta** (v0.1.0b1). Core features are stable and tested, but the API may still evolve based on community feedback. Ready for testing in real-world ML workflows.
> HashPrep is in **beta** (v0.1.0b2). Core features are fully tested with CI. The API may still evolve based on community feedback.

## Overview

Expand Down Expand Up @@ -165,19 +166,31 @@ hashprep version
```

#### Available Checks
- `outliers` - Detect outliers using IQR method
- `outliers` - Detect outliers using z-score
- `duplicates` - Find duplicate rows
- `high_missing_values` - Columns with >50% missing data
- `high_missing_values` - Columns with high missing data
- `empty_columns` - Completely empty columns
- `dataset_missingness` - Overall missing data patterns
- `missing_patterns` - Correlated missing value patterns
- `high_cardinality` - Categorical columns with too many unique values
- `single_value_columns` - Constant columns with no variance
- `mixed_data_types` - Columns with mixed data types
- `class_imbalance` - Imbalanced target variable (requires --target)
- `feature_correlation` - Highly correlated features
- `target_leakage` - Features that may leak target information
- `feature_correlation` - Highly correlated numeric features
- `categorical_correlation` - Highly associated categorical features
- `mixed_correlation` - Numeric-categorical associations
- `data_leakage` - Columns identical to target
- `target_leakage_patterns` - Features that may leak target information
- `dataset_drift` - Distribution drift between datasets (requires --comparison)
- `uniform_distribution` - Uniformly distributed numeric columns
- `unique_values` - Columns where >95% values are unique
- `many_zeros` - Columns with excessive zero values
- `high_zero_counts` - Columns with excessive zero values
- `skewness` - Highly skewed numeric distributions
- `infinite_values` - Columns containing infinite values
- `constant_length` - String columns with constant character length
- `extreme_text_lengths` - Text columns with extreme value lengths
- `datetime_skew` - Datetime columns concentrated in one period
- `empty_dataset` - Empty or all-missing datasets

---

Expand Down
2 changes: 1 addition & 1 deletion hashprep/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer

__version__ = "0.1.0b1"
__version__ = "0.1.0b2"
28 changes: 13 additions & 15 deletions hashprep/checks/leakage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
# Module-level leakage-check configuration and logger.
# DEFAULT_CONFIG and get_logger are imported further up in the file (outside
# this hunk); _LEAK bundles the thresholds the leakage checks read.
_LEAK = DEFAULT_CONFIG.leakage
_log = get_logger("checks.leakage")

# Quick-fix guidance attached to critical target-leakage issues. Hoisted to
# module level so the several reporting sites in this module emit identical
# text instead of repeating the literal inline.
_LEAKAGE_CRITICAL_FIX = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info)."
"\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
)
# Milder guidance used when the issue severity is a warning rather than
# critical — offers retain/engineer alternatives in addition to dropping.
_LEAKAGE_WARNING_FIX = (
"Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info)."
"\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting)."
"\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)


def _check_data_leakage(analyzer):
issues = []
Expand Down Expand Up @@ -52,11 +62,7 @@ def _check_target_leakage_patterns(analyzer):
)
if severity:
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
if severity == "critical"
else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)
quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
issues.append(
Issue(
category="target_leakage",
Expand Down Expand Up @@ -87,11 +93,7 @@ def _check_target_leakage_patterns(analyzer):
)
if severity:
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
if severity == "critical"
else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)
quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
issues.append(
Issue(
category="target_leakage",
Expand Down Expand Up @@ -127,11 +129,7 @@ def _check_target_leakage_patterns(analyzer):
)
if severity:
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
if severity == "critical"
else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)
quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
issues.append(
Issue(
category="target_leakage",
Expand Down
42 changes: 42 additions & 0 deletions hashprep/reports/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Base report class with shared logic for all report renderers."""


class BaseReport:
    """Shared behavior for report renderers (HTML, Markdown, PDF).

    Subclasses inherit the issue-category → display-label mapping and the
    alert-grouping helper so every output format buckets issues identically.
    """

    # Maps an Issue's ``category`` string to the human-readable alert label
    # shown in the report's alerts section. Categories absent from this table
    # fall back to "Other" in _group_alerts_by_type.
    ALERT_TYPE_MAPPING = {
        "feature_correlation": "High Correlation",
        "categorical_correlation": "High Correlation",
        "mixed_correlation": "High Correlation",
        "missing_values": "Missing",
        "high_missing_values": "Missing",
        "dataset_missingness": "Missing",
        "missing_patterns": "Missing",
        "uniform_distribution": "Uniform",
        "unique_values": "Unique",
        "high_zero_counts": "Zeros",
        "outliers": "Outliers",
        "skewness": "Skewness",
        "high_cardinality": "High Cardinality",
        "duplicates": "Duplicates",
        "data_leakage": "Leakage",
        "target_leakage_patterns": "Leakage",
        "class_imbalance": "Imbalance",
        "empty_columns": "Empty",
        "single_value_columns": "Constant",
        "mixed_data_types": "Mixed Types",
        "extreme_text_lengths": "Text Length",
        "datetime_skew": "DateTime Skew",
        "dataset_drift": "Drift",
        "infinite_values": "Infinite",
        "constant_length": "Constant Length",
        "empty_dataset": "Empty Dataset",
    }

    def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
        """Bucket issues by display label for the alerts section.

        Each issue dict is placed under the label its ``category`` maps to in
        ALERT_TYPE_MAPPING; unmapped categories land under "Other". Insertion
        order of labels follows first appearance in ``issues``.
        """
        buckets: dict[str, list[dict]] = {}
        for entry in issues:
            label = self.ALERT_TYPE_MAPPING.get(entry["category"], "Other")
            buckets.setdefault(label, []).append(entry)
        return buckets
43 changes: 2 additions & 41 deletions hashprep/reports/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,10 @@

import hashprep

from .base import BaseReport

class HtmlReport:
ALERT_TYPE_MAPPING = {
"feature_correlation": "High Correlation",
"categorical_correlation": "High Correlation",
"mixed_correlation": "High Correlation",
"missing_values": "Missing",
"high_missing_values": "Missing",
"dataset_missingness": "Missing",
"missing_patterns": "Missing",
"uniform_distribution": "Uniform",
"unique_values": "Unique",
"high_zero_counts": "Zeros",
"outliers": "Outliers",
"skewness": "Skewness",
"high_cardinality": "High Cardinality",
"duplicates": "Duplicates",
"data_leakage": "Leakage",
"target_leakage_patterns": "Leakage",
"class_imbalance": "Imbalance",
"empty_columns": "Empty",
"single_value_columns": "Constant",
"mixed_data_types": "Mixed Types",
"extreme_text_lengths": "Text Length",
"datetime_skew": "DateTime Skew",
"dataset_drift": "Drift",
"infinite_values": "Infinite",
"constant_length": "Constant Length",
"empty_dataset": "Empty Dataset",
}

class HtmlReport(BaseReport):
def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_mode=False):
template_str = self._get_template(theme)
template = Template(template_str)
Expand Down Expand Up @@ -101,18 +74,6 @@ def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_m
f.write(html_content)
return html_content

def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
"""Group issues into display categories for the alerts section."""
groups: dict[str, list[dict]] = {}

for issue in issues:
alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
if alert_type not in groups:
groups[alert_type] = []
groups[alert_type].append(issue)

return groups

def _generate_config(self, summary) -> dict:
"""Generate configuration dict for download."""
reproduction_info = summary["summaries"].get("reproduction_info", {})
Expand Down
42 changes: 2 additions & 40 deletions hashprep/reports/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,12 @@
import hashprep

from ..utils.logging import get_logger
from .base import BaseReport

_log = get_logger("reports.markdown")


class MarkdownReport:
ALERT_TYPE_MAPPING = {
"feature_correlation": "High Correlation",
"categorical_correlation": "High Correlation",
"mixed_correlation": "High Correlation",
"missing_values": "Missing",
"high_missing_values": "Missing",
"dataset_missingness": "Missing",
"missing_patterns": "Missing",
"uniform_distribution": "Uniform",
"unique_values": "Unique",
"high_zero_counts": "Zeros",
"outliers": "Outliers",
"skewness": "Skewness",
"high_cardinality": "High Cardinality",
"duplicates": "Duplicates",
"data_leakage": "Leakage",
"target_leakage_patterns": "Leakage",
"class_imbalance": "Imbalance",
"empty_columns": "Empty",
"single_value_columns": "Constant",
"mixed_data_types": "Mixed Types",
"extreme_text_lengths": "Text Length",
"datetime_skew": "DateTime Skew",
"dataset_drift": "Drift",
"infinite_values": "Infinite",
"constant_length": "Constant Length",
"empty_dataset": "Empty Dataset",
}

class MarkdownReport(BaseReport):
def generate(self, summary, full=False, output_file=None):
dataset_info = summary["summaries"]["dataset_info"]
reproduction_info = summary["summaries"].get("reproduction_info", {})
Expand Down Expand Up @@ -275,13 +247,3 @@ def generate(self, summary, full=False, output_file=None):
with open(output_file, "w") as f:
f.write(content)
return content

def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
"""Group issues into display categories."""
groups: dict[str, list[dict]] = {}
for issue in issues:
alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
if alert_type not in groups:
groups[alert_type] = []
groups[alert_type].append(issue)
return groups
40 changes: 2 additions & 38 deletions hashprep/reports/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,10 @@

import hashprep

from .base import BaseReport

class PdfReport:
ALERT_TYPE_MAPPING = {
"feature_correlation": "High Correlation",
"categorical_correlation": "High Correlation",
"mixed_correlation": "High Correlation",
"missing_values": "Missing",
"high_missing_values": "Missing",
"dataset_missingness": "Missing",
"missing_patterns": "Missing",
"uniform_distribution": "Uniform",
"unique_values": "Unique",
"high_zero_counts": "Zeros",
"outliers": "Outliers",
"skewness": "Skewness",
"high_cardinality": "High Cardinality",
"duplicates": "Duplicates",
"data_leakage": "Leakage",
"target_leakage_patterns": "Leakage",
"class_imbalance": "Imbalance",
"empty_columns": "Empty",
"single_value_columns": "Constant",
"mixed_data_types": "Mixed Types",
"extreme_text_lengths": "Text Length",
"datetime_skew": "DateTime Skew",
"dataset_drift": "Drift",
"infinite_values": "Infinite",
"constant_length": "Constant Length",
"empty_dataset": "Empty Dataset",
}

class PdfReport(BaseReport):
def generate(self, summary, full=False, output_file=None, **kwargs):
template = Template(self._get_template())

Expand Down Expand Up @@ -87,15 +60,6 @@ def generate(self, summary, full=False, output_file=None, **kwargs):
f.write(pdf_content)
return pdf_content

def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
groups: dict[str, list[dict]] = {}
for issue in issues:
alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
if alert_type not in groups:
groups[alert_type] = []
groups[alert_type].append(issue)
return groups

def _get_template(self):
return """<!DOCTYPE html>
<html>
Expand Down
Loading