Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
<p align="center">
<!-- Distribution -->
<img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" />
<!-- <img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" /> -->
<!-- License -->
<img src="https://img.shields.io/badge/License-MIT-green" />
<img src="https://img.shields.io/badge/CLI-Supported-orange" />
<!-- CI -->
<a href="https://github.com/cachevector/hashprep/actions/workflows/ci.yml"><img src="https://github.com/cachevector/hashprep/actions/workflows/ci.yml/badge.svg" /></a>
</p>
<p>
<!-- Features -->
Expand All @@ -26,7 +27,7 @@
</div>

> [!NOTE]
> HashPrep is now in **beta** (v0.1.0b1). Core features are stable and tested, but the API may still evolve based on community feedback. Ready for testing in real-world ML workflows.
> HashPrep is in **beta** (v0.1.0b2). Core features are fully tested with CI. The API may still evolve based on community feedback.

## Overview

Expand Down Expand Up @@ -165,19 +166,31 @@ hashprep version
```

#### Available Checks
- `outliers` - Detect outliers using IQR method
- `outliers` - Detect outliers using z-score
- `duplicates` - Find duplicate rows
- `high_missing_values` - Columns with >50% missing data
- `high_missing_values` - Columns with high missing data
- `empty_columns` - Completely empty columns
- `dataset_missingness` - Overall missing data patterns
- `missing_patterns` - Correlated missing value patterns
- `high_cardinality` - Categorical columns with too many unique values
- `single_value_columns` - Constant columns with no variance
- `mixed_data_types` - Columns with mixed data types
- `class_imbalance` - Imbalanced target variable (requires --target)
- `feature_correlation` - Highly correlated features
- `target_leakage` - Features that may leak target information
- `feature_correlation` - Highly correlated numeric features
- `categorical_correlation` - Highly associated categorical features
- `mixed_correlation` - Numeric-categorical associations
- `data_leakage` - Columns identical to target
- `target_leakage_patterns` - Features that may leak target information
- `dataset_drift` - Distribution drift between datasets (requires --comparison)
- `uniform_distribution` - Uniformly distributed numeric columns
- `unique_values` - Columns where >95% values are unique
- `many_zeros` - Columns with excessive zero values
- `high_zero_counts` - Columns with excessive zero values
- `skewness` - Highly skewed numeric distributions
- `infinite_values` - Columns containing infinite values
- `constant_length` - String columns with constant character length
- `extreme_text_lengths` - Text columns with extreme value lengths
- `datetime_skew` - Datetime columns concentrated in one period
- `empty_dataset` - Empty or all-missing datasets

---

Expand Down
2 changes: 1 addition & 1 deletion hashprep/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer

__version__ = "0.1.0b1"
__version__ = "0.1.0b2"
28 changes: 13 additions & 15 deletions hashprep/checks/leakage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
# Module-level leakage-check configuration and logger.
# DEFAULT_CONFIG and get_logger are imported further up in the file (outside
# this hunk); _LEAK bundles the thresholds the leakage checks read.
_LEAK = DEFAULT_CONFIG.leakage
_log = get_logger("checks.leakage")

# Quick-fix guidance attached to critical target-leakage issues. Hoisted to
# module level so the several reporting sites in this module emit identical
# text instead of repeating the literal inline.
_LEAKAGE_CRITICAL_FIX = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info)."
"\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
)
# Milder guidance used when the issue severity is a warning rather than
# critical — offers retain/engineer alternatives in addition to dropping.
_LEAKAGE_WARNING_FIX = (
"Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info)."
"\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting)."
"\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)


def _check_data_leakage(analyzer):
issues = []
Expand Down Expand Up @@ -52,11 +62,7 @@ def _check_target_leakage_patterns(analyzer):
)
if severity:
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
if severity == "critical"
else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)
quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
issues.append(
Issue(
category="target_leakage",
Expand Down Expand Up @@ -87,11 +93,7 @@ def _check_target_leakage_patterns(analyzer):
)
if severity:
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
if severity == "critical"
else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)
quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
issues.append(
Issue(
category="target_leakage",
Expand Down Expand Up @@ -127,11 +129,7 @@ def _check_target_leakage_patterns(analyzer):
)
if severity:
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop column: Prevents target leakage (Pros: Ensures model integrity; Cons: Loses feature info).\n- Verify feature: Check if correlation is valid or data-derived (Pros: Validates data; Cons: Time-consuming)."
if severity == "critical"
else "Options: \n- Drop column: Reduces leakage risk (Pros: Safer model; Cons: May lose predictive info).\n- Retain and test: Use robust models (e.g., trees) and evaluate (Pros: Keeps potential signal; Cons: Risk of overfitting).\n- Engineer feature: Transform to reduce correlation (Pros: Retains info; Cons: Adds complexity)."
)
quick_fix = _LEAKAGE_CRITICAL_FIX if severity == "critical" else _LEAKAGE_WARNING_FIX
issues.append(
Issue(
category="target_leakage",
Expand Down
42 changes: 42 additions & 0 deletions hashprep/reports/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Base report class with shared logic for all report renderers."""


class BaseReport:
    """Shared behavior for report renderers (HTML, Markdown, PDF).

    Subclasses inherit the issue-category → display-label mapping and the
    alert-grouping helper so every output format buckets issues identically.
    """

    # Maps an Issue's ``category`` string to the human-readable alert label
    # shown in the report's alerts section. Categories absent from this table
    # fall back to "Other" in _group_alerts_by_type.
    ALERT_TYPE_MAPPING = {
        "feature_correlation": "High Correlation",
        "categorical_correlation": "High Correlation",
        "mixed_correlation": "High Correlation",
        "missing_values": "Missing",
        "high_missing_values": "Missing",
        "dataset_missingness": "Missing",
        "missing_patterns": "Missing",
        "uniform_distribution": "Uniform",
        "unique_values": "Unique",
        "high_zero_counts": "Zeros",
        "outliers": "Outliers",
        "skewness": "Skewness",
        "high_cardinality": "High Cardinality",
        "duplicates": "Duplicates",
        "data_leakage": "Leakage",
        "target_leakage_patterns": "Leakage",
        "class_imbalance": "Imbalance",
        "empty_columns": "Empty",
        "single_value_columns": "Constant",
        "mixed_data_types": "Mixed Types",
        "extreme_text_lengths": "Text Length",
        "datetime_skew": "DateTime Skew",
        "dataset_drift": "Drift",
        "infinite_values": "Infinite",
        "constant_length": "Constant Length",
        "empty_dataset": "Empty Dataset",
    }

    def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
        """Bucket issues by display label for the alerts section.

        Each issue dict is placed under the label its ``category`` maps to in
        ALERT_TYPE_MAPPING; unmapped categories land under "Other". Insertion
        order of labels follows first appearance in ``issues``.
        """
        buckets: dict[str, list[dict]] = {}
        for entry in issues:
            label = self.ALERT_TYPE_MAPPING.get(entry["category"], "Other")
            buckets.setdefault(label, []).append(entry)
        return buckets
43 changes: 2 additions & 41 deletions hashprep/reports/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,10 @@

import hashprep

from .base import BaseReport

class HtmlReport:
ALERT_TYPE_MAPPING = {
"feature_correlation": "High Correlation",
"categorical_correlation": "High Correlation",
"mixed_correlation": "High Correlation",
"missing_values": "Missing",
"high_missing_values": "Missing",
"dataset_missingness": "Missing",
"missing_patterns": "Missing",
"uniform_distribution": "Uniform",
"unique_values": "Unique",
"high_zero_counts": "Zeros",
"outliers": "Outliers",
"skewness": "Skewness",
"high_cardinality": "High Cardinality",
"duplicates": "Duplicates",
"data_leakage": "Leakage",
"target_leakage_patterns": "Leakage",
"class_imbalance": "Imbalance",
"empty_columns": "Empty",
"single_value_columns": "Constant",
"mixed_data_types": "Mixed Types",
"extreme_text_lengths": "Text Length",
"datetime_skew": "DateTime Skew",
"dataset_drift": "Drift",
"infinite_values": "Infinite",
"constant_length": "Constant Length",
"empty_dataset": "Empty Dataset",
}

class HtmlReport(BaseReport):
def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_mode=False):
template_str = self._get_template(theme)
template = Template(template_str)
Expand Down Expand Up @@ -101,18 +74,6 @@ def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_m
f.write(html_content)
return html_content

def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
"""Group issues into display categories for the alerts section."""
groups: dict[str, list[dict]] = {}

for issue in issues:
alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
if alert_type not in groups:
groups[alert_type] = []
groups[alert_type].append(issue)

return groups

def _generate_config(self, summary) -> dict:
"""Generate configuration dict for download."""
reproduction_info = summary["summaries"].get("reproduction_info", {})
Expand Down
42 changes: 2 additions & 40 deletions hashprep/reports/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,12 @@
import hashprep

from ..utils.logging import get_logger
from .base import BaseReport

_log = get_logger("reports.markdown")


class MarkdownReport:
ALERT_TYPE_MAPPING = {
"feature_correlation": "High Correlation",
"categorical_correlation": "High Correlation",
"mixed_correlation": "High Correlation",
"missing_values": "Missing",
"high_missing_values": "Missing",
"dataset_missingness": "Missing",
"missing_patterns": "Missing",
"uniform_distribution": "Uniform",
"unique_values": "Unique",
"high_zero_counts": "Zeros",
"outliers": "Outliers",
"skewness": "Skewness",
"high_cardinality": "High Cardinality",
"duplicates": "Duplicates",
"data_leakage": "Leakage",
"target_leakage_patterns": "Leakage",
"class_imbalance": "Imbalance",
"empty_columns": "Empty",
"single_value_columns": "Constant",
"mixed_data_types": "Mixed Types",
"extreme_text_lengths": "Text Length",
"datetime_skew": "DateTime Skew",
"dataset_drift": "Drift",
"infinite_values": "Infinite",
"constant_length": "Constant Length",
"empty_dataset": "Empty Dataset",
}

class MarkdownReport(BaseReport):
def generate(self, summary, full=False, output_file=None):
dataset_info = summary["summaries"]["dataset_info"]
reproduction_info = summary["summaries"].get("reproduction_info", {})
Expand Down Expand Up @@ -275,13 +247,3 @@ def generate(self, summary, full=False, output_file=None):
with open(output_file, "w") as f:
f.write(content)
return content

def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
"""Group issues into display categories."""
groups: dict[str, list[dict]] = {}
for issue in issues:
alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
if alert_type not in groups:
groups[alert_type] = []
groups[alert_type].append(issue)
return groups
40 changes: 2 additions & 38 deletions hashprep/reports/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,10 @@

import hashprep

from .base import BaseReport

class PdfReport:
ALERT_TYPE_MAPPING = {
"feature_correlation": "High Correlation",
"categorical_correlation": "High Correlation",
"mixed_correlation": "High Correlation",
"missing_values": "Missing",
"high_missing_values": "Missing",
"dataset_missingness": "Missing",
"missing_patterns": "Missing",
"uniform_distribution": "Uniform",
"unique_values": "Unique",
"high_zero_counts": "Zeros",
"outliers": "Outliers",
"skewness": "Skewness",
"high_cardinality": "High Cardinality",
"duplicates": "Duplicates",
"data_leakage": "Leakage",
"target_leakage_patterns": "Leakage",
"class_imbalance": "Imbalance",
"empty_columns": "Empty",
"single_value_columns": "Constant",
"mixed_data_types": "Mixed Types",
"extreme_text_lengths": "Text Length",
"datetime_skew": "DateTime Skew",
"dataset_drift": "Drift",
"infinite_values": "Infinite",
"constant_length": "Constant Length",
"empty_dataset": "Empty Dataset",
}

class PdfReport(BaseReport):
def generate(self, summary, full=False, output_file=None, **kwargs):
template = Template(self._get_template())

Expand Down Expand Up @@ -87,15 +60,6 @@ def generate(self, summary, full=False, output_file=None, **kwargs):
f.write(pdf_content)
return pdf_content

def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]:
groups: dict[str, list[dict]] = {}
for issue in issues:
alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other")
if alert_type not in groups:
groups[alert_type] = []
groups[alert_type].append(issue)
return groups

def _get_template(self):
return """<!DOCTYPE html>
<html>
Expand Down
Loading