From 5d417bb8cc9edea11e65298900ffa7308560573c Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Thu, 19 Feb 2026 21:46:44 +0530 Subject: [PATCH 1/2] refactor: deduplicate report renderers, optimize text summarization, update README - Extract BaseReport class with shared ALERT_TYPE_MAPPING and _group_alerts_by_type() (removed ~100 lines of copy-paste across 3 files) - Optimize _summarize_text() with single-pass Counter-based analysis (eliminates double unicodedata.category() calls and pd.Series copies) - Extract duplicated leakage quick_fix strings to module constants - Update README: stable release note, CI badge, complete checks list (25 checks) --- README.md | 27 +++++++++++---- hashprep/checks/leakage.py | 28 ++++++++-------- hashprep/reports/base.py | 42 +++++++++++++++++++++++ hashprep/reports/html.py | 43 ++---------------------- hashprep/reports/markdown.py | 42 ++--------------------- hashprep/reports/pdf.py | 40 ++-------------------- hashprep/summaries/variables.py | 59 +++++++++++++++++---------------- 7 files changed, 112 insertions(+), 169 deletions(-) create mode 100644 hashprep/reports/base.py diff --git a/README.md b/README.md index 0e26d77..694026a 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,11 @@

- + +

@@ -26,7 +27,7 @@ > [!NOTE] -> HashPrep is now in **beta** (v0.1.0b1). Core features are stable and tested, but the API may still evolve based on community feedback. Ready for testing in real-world ML workflows. +> HashPrep v0.1.0 is now stable. Core features are fully tested with CI. The API may still evolve based on community feedback. ## Overview @@ -165,19 +166,31 @@ hashprep version ``` #### Available Checks -- `outliers` - Detect outliers using IQR method +- `outliers` - Detect outliers using z-score - `duplicates` - Find duplicate rows -- `high_missing_values` - Columns with >50% missing data +- `high_missing_values` - Columns with high missing data +- `empty_columns` - Completely empty columns - `dataset_missingness` - Overall missing data patterns +- `missing_patterns` - Correlated missing value patterns - `high_cardinality` - Categorical columns with too many unique values - `single_value_columns` - Constant columns with no variance +- `mixed_data_types` - Columns with mixed data types - `class_imbalance` - Imbalanced target variable (requires --target) -- `feature_correlation` - Highly correlated features -- `target_leakage` - Features that may leak target information +- `feature_correlation` - Highly correlated numeric features +- `categorical_correlation` - Highly associated categorical features +- `mixed_correlation` - Numeric-categorical associations +- `data_leakage` - Columns identical to target +- `target_leakage_patterns` - Features that may leak target information - `dataset_drift` - Distribution drift between datasets (requires --comparison) - `uniform_distribution` - Uniformly distributed numeric columns - `unique_values` - Columns where >95% values are unique -- `many_zeros` - Columns with excessive zero values +- `high_zero_counts` - Columns with excessive zero values +- `skewness` - Highly skewed numeric distributions +- `infinite_values` - Columns containing infinite values +- `constant_length` - String columns with constant character 
length +- `extreme_text_lengths` - Text columns with extreme value lengths +- `datetime_skew` - Datetime columns concentrated in one period +- `empty_dataset` - Empty or all-missing datasets --- diff --git a/hashprep/checks/leakage.py b/hashprep/checks/leakage.py index cdfa429..c98e650 100644 --- a/hashprep/checks/leakage.py +++ b/hashprep/checks/leakage.py @@ -9,6 +9,16 @@ _LEAK = DEFAULT_CONFIG.leakage _log = get_logger("checks.leakage") +_LEAKAGE_CRITICAL_FIX = ( + "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info)." + "\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)." +) +_LEAKAGE_WARNING_FIX = ( + "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info)." + "\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting)." + "\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)." +) + def _check_data_leakage(analyzer): issues = [] @@ -52,11 +62,7 @@ def _check_target_leakage_patterns(analyzer): ) if severity: impact = "high" if severity == "critical" else "medium" - quick_fix = ( - "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)." - if severity == "critical" - else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)." 
- ) + quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX issues.append( Issue( category="target_leakage", @@ -87,11 +93,7 @@ def _check_target_leakage_patterns(analyzer): ) if severity: impact = "high" if severity == "critical" else "medium" - quick_fix = ( - "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)." - if severity == "critical" - else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)." - ) + quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX issues.append( Issue( category="target_leakage", @@ -127,11 +129,7 @@ def _check_target_leakage_patterns(analyzer): ) if severity: impact = "high" if severity == "critical" else "medium" - quick_fix = ( - "Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)." - if severity == "critical" - else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)." 
- ) + quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX issues.append( Issue( category="target_leakage", diff --git a/hashprep/reports/base.py b/hashprep/reports/base.py new file mode 100644 index 0000000..a497dbc --- /dev/null +++ b/hashprep/reports/base.py @@ -0,0 +1,42 @@ +"""Base report class with shared logic for all report renderers.""" + + +class BaseReport: + ALERT_TYPE_MAPPING = { + "feature_correlation": "High Correlation", + "categorical_correlation": "High Correlation", + "mixed_correlation": "High Correlation", + "missing_values": "Missing", + "high_missing_values": "Missing", + "dataset_missingness": "Missing", + "missing_patterns": "Missing", + "uniform_distribution": "Uniform", + "unique_values": "Unique", + "high_zero_counts": "Zeros", + "outliers": "Outliers", + "skewness": "Skewness", + "high_cardinality": "High Cardinality", + "duplicates": "Duplicates", + "data_leakage": "Leakage", + "target_leakage_patterns": "Leakage", + "class_imbalance": "Imbalance", + "empty_columns": "Empty", + "single_value_columns": "Constant", + "mixed_data_types": "Mixed Types", + "extreme_text_lengths": "Text Length", + "datetime_skew": "DateTime Skew", + "dataset_drift": "Drift", + "infinite_values": "Infinite", + "constant_length": "Constant Length", + "empty_dataset": "Empty Dataset", + } + + def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: + """Group issues into display categories for the alerts section.""" + groups: dict[str, list[dict]] = {} + for issue in issues: + alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") + if alert_type not in groups: + groups[alert_type] = [] + groups[alert_type].append(issue) + return groups diff --git a/hashprep/reports/html.py b/hashprep/reports/html.py index a86d27f..e45f876 100644 --- a/hashprep/reports/html.py +++ b/hashprep/reports/html.py @@ -7,37 +7,10 @@ import hashprep +from .base import BaseReport -class HtmlReport: - ALERT_TYPE_MAPPING = 
{ - "feature_correlation": "High Correlation", - "categorical_correlation": "High Correlation", - "mixed_correlation": "High Correlation", - "missing_values": "Missing", - "high_missing_values": "Missing", - "dataset_missingness": "Missing", - "missing_patterns": "Missing", - "uniform_distribution": "Uniform", - "unique_values": "Unique", - "high_zero_counts": "Zeros", - "outliers": "Outliers", - "skewness": "Skewness", - "high_cardinality": "High Cardinality", - "duplicates": "Duplicates", - "data_leakage": "Leakage", - "target_leakage_patterns": "Leakage", - "class_imbalance": "Imbalance", - "empty_columns": "Empty", - "single_value_columns": "Constant", - "mixed_data_types": "Mixed Types", - "extreme_text_lengths": "Text Length", - "datetime_skew": "DateTime Skew", - "dataset_drift": "Drift", - "infinite_values": "Infinite", - "constant_length": "Constant Length", - "empty_dataset": "Empty Dataset", - } +class HtmlReport(BaseReport): def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_mode=False): template_str = self._get_template(theme) template = Template(template_str) @@ -101,18 +74,6 @@ def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_m f.write(html_content) return html_content - def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: - """Group issues into display categories for the alerts section.""" - groups: dict[str, list[dict]] = {} - - for issue in issues: - alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") - if alert_type not in groups: - groups[alert_type] = [] - groups[alert_type].append(issue) - - return groups - def _generate_config(self, summary) -> dict: """Generate configuration dict for download.""" reproduction_info = summary["summaries"].get("reproduction_info", {}) diff --git a/hashprep/reports/markdown.py b/hashprep/reports/markdown.py index 39282fc..1af75c4 100644 --- a/hashprep/reports/markdown.py +++ b/hashprep/reports/markdown.py @@ -6,40 
+6,12 @@ import hashprep from ..utils.logging import get_logger +from .base import BaseReport _log = get_logger("reports.markdown") -class MarkdownReport: - ALERT_TYPE_MAPPING = { - "feature_correlation": "High Correlation", - "categorical_correlation": "High Correlation", - "mixed_correlation": "High Correlation", - "missing_values": "Missing", - "high_missing_values": "Missing", - "dataset_missingness": "Missing", - "missing_patterns": "Missing", - "uniform_distribution": "Uniform", - "unique_values": "Unique", - "high_zero_counts": "Zeros", - "outliers": "Outliers", - "skewness": "Skewness", - "high_cardinality": "High Cardinality", - "duplicates": "Duplicates", - "data_leakage": "Leakage", - "target_leakage_patterns": "Leakage", - "class_imbalance": "Imbalance", - "empty_columns": "Empty", - "single_value_columns": "Constant", - "mixed_data_types": "Mixed Types", - "extreme_text_lengths": "Text Length", - "datetime_skew": "DateTime Skew", - "dataset_drift": "Drift", - "infinite_values": "Infinite", - "constant_length": "Constant Length", - "empty_dataset": "Empty Dataset", - } - +class MarkdownReport(BaseReport): def generate(self, summary, full=False, output_file=None): dataset_info = summary["summaries"]["dataset_info"] reproduction_info = summary["summaries"].get("reproduction_info", {}) @@ -275,13 +247,3 @@ def generate(self, summary, full=False, output_file=None): with open(output_file, "w") as f: f.write(content) return content - - def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: - """Group issues into display categories.""" - groups: dict[str, list[dict]] = {} - for issue in issues: - alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") - if alert_type not in groups: - groups[alert_type] = [] - groups[alert_type].append(issue) - return groups diff --git a/hashprep/reports/pdf.py b/hashprep/reports/pdf.py index db8dccf..609dfe6 100644 --- a/hashprep/reports/pdf.py +++ b/hashprep/reports/pdf.py @@ -6,37 +6,10 @@ 
import hashprep +from .base import BaseReport -class PdfReport: - ALERT_TYPE_MAPPING = { - "feature_correlation": "High Correlation", - "categorical_correlation": "High Correlation", - "mixed_correlation": "High Correlation", - "missing_values": "Missing", - "high_missing_values": "Missing", - "dataset_missingness": "Missing", - "missing_patterns": "Missing", - "uniform_distribution": "Uniform", - "unique_values": "Unique", - "high_zero_counts": "Zeros", - "outliers": "Outliers", - "skewness": "Skewness", - "high_cardinality": "High Cardinality", - "duplicates": "Duplicates", - "data_leakage": "Leakage", - "target_leakage_patterns": "Leakage", - "class_imbalance": "Imbalance", - "empty_columns": "Empty", - "single_value_columns": "Constant", - "mixed_data_types": "Mixed Types", - "extreme_text_lengths": "Text Length", - "datetime_skew": "DateTime Skew", - "dataset_drift": "Drift", - "infinite_values": "Infinite", - "constant_length": "Constant Length", - "empty_dataset": "Empty Dataset", - } +class PdfReport(BaseReport): def generate(self, summary, full=False, output_file=None, **kwargs): template = Template(self._get_template()) @@ -87,15 +60,6 @@ def generate(self, summary, full=False, output_file=None, **kwargs): f.write(pdf_content) return pdf_content - def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: - groups: dict[str, list[dict]] = {} - for issue in issues: - alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") - if alert_type not in groups: - groups[alert_type] = [] - groups[alert_type].append(issue) - return groups - def _get_template(self): return """ diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py index ba019e0..8346a74 100644 --- a/hashprep/summaries/variables.py +++ b/hashprep/summaries/variables.py @@ -1,6 +1,6 @@ import re import unicodedata -from collections import defaultdict +from collections import Counter, defaultdict import numpy as np import pandas as pd @@ -181,52 
+181,55 @@ def _summarize_text(df, col): lengths = series.str.len() all_text = "".join(series) total_chars = len(all_text) - distinct_chars = len(set(all_text)) - all_categories = [unicodedata.category(c) for c in all_text] - cat_series = pd.Series(all_categories) - distinct_categories = int(cat_series.nunique()) - most_occurring_categories = cat_series.value_counts().head(10).to_dict() - cat_to_char_count = defaultdict(lambda: defaultdict(int)) + + # Single pass: collect char counts, category counts, and per-category char counts + char_counts = Counter() + cat_counts = Counter() + cat_to_char_count = defaultdict(Counter) for c in all_text: + char_counts[c] += 1 cat = unicodedata.category(c) + cat_counts[cat] += 1 cat_to_char_count[cat][c] += 1 + + distinct_chars = len(char_counts) + distinct_categories = len(cat_counts) + most_freq_per_cat = {} - for cat, char_count in cat_to_char_count.items(): - if char_count: - top_char = max(char_count, key=char_count.get) - count = char_count[top_char] - freq = (count / total_chars * 100) if total_chars > 0 else 0 - most_freq_per_cat[cat] = { - "char": top_char, - "count": count, - "percentage": float(freq), - } - distinct_scripts = None - most_occurring_scripts = None + for cat, char_counter in cat_to_char_count.items(): + top_char, count = char_counter.most_common(1)[0] + freq = (count / total_chars * 100) if total_chars > 0 else 0 + most_freq_per_cat[cat] = { + "char": top_char, + "count": count, + "percentage": float(freq), + } + + # Word analysis words = re.findall(r"\b\w+\b", all_text.lower()) word_len = len(words) - word_vc = pd.Series(words).value_counts().head(10) words_dict = { w: { - "count": int(c), + "count": c, "frequency": float(c / word_len * 100) if word_len > 0 else 0.0, } - for w, c in word_vc.items() + for w, c in Counter(words).most_common(10) } - char_vc = pd.Series(list(all_text)).value_counts().head(10) + + # Top characters and categories char_dict = { str(c): { - "count": int(v), + "count": v, 
"percentage": float(v / total_chars * 100) if total_chars > 0 else 0.0, } - for c, v in char_vc.items() + for c, v in char_counts.most_common(10) } cat_dict = { k: { "count": v, "percentage": float(v / total_chars * 100) if total_chars > 0 else 0.0, } - for k, v in most_occurring_categories.items() + for k, v in cat_counts.most_common(10) } sample = [str(s) for s in series.head(5).tolist()] stats = { @@ -241,7 +244,7 @@ def _summarize_text(df, col): "total_characters": total_chars, "distinct_characters": distinct_chars, "distinct_categories": distinct_categories, - "distinct_scripts": distinct_scripts, + "distinct_scripts": None, "distinct_blocks": None, }, "sample": sample, @@ -254,7 +257,7 @@ def _summarize_text(df, col): "most_frequent_character_per_category": most_freq_per_cat, }, "scripts": { - "most_occurring_scripts": most_occurring_scripts, + "most_occurring_scripts": None, "most_frequent_character_per_script": None, }, "blocks": { From f1ef5483238dd91b02a0e5f7a2548d7bb1a97a86 Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Thu, 19 Feb 2026 21:47:03 +0530 Subject: [PATCH 2/2] chore: bump version to 0.1.0b2 (beta release) --- README.md | 2 +- hashprep/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 694026a..95ba85e 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ > [!NOTE] -> HashPrep v0.1.0 is now stable. Core features are fully tested with CI. The API may still evolve based on community feedback. +> HashPrep is in **beta** (v0.1.0b2). Core features are fully tested with CI. The API may still evolve based on community feedback. ## Overview diff --git a/hashprep/__init__.py b/hashprep/__init__.py index 231a028..6534f0d 100644 --- a/hashprep/__init__.py +++ b/hashprep/__init__.py @@ -1,3 +1,3 @@ from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer -__version__ = "0.1.0b1" +__version__ = "0.1.0b2"