diff --git a/python-pointblank/README.md b/python-pointblank/README.md
new file mode 100644
index 0000000000..2fae88f76f
--- /dev/null
+++ b/python-pointblank/README.md
@@ -0,0 +1,24 @@
+# Validating Data With Pointblank in Python
+
+Supporting code and sample data for the Real Python tutorial
+"Validating Data With Pointblank in Python".
+
+## Requirements
+
+The Python scripts use PEP 723 dependency metadata and run with
+[uv](https://docs.astral.sh/uv/):
+
+```console
+$ uv run pointblank_quickstart.py
+$ uv run pointblank_thresholds.py
+$ uv run pointblank_atoms.py
+```
+
+The command-line examples can run without a project environment:
+
+```console
+$ uv run --no-project --with 'pointblank[pl]' -- pb scan pointblank_atoms.csv
+$ uv run --no-project --with 'pointblank[pl]' -- pb missing pointblank_atoms.csv
+$ uvx --from 'pointblank[pl]' pb run pointblank_atoms.yaml --output-html pointblank_report.html
+```
+
diff --git a/python-pointblank/pointblank_atoms.csv b/python-pointblank/pointblank_atoms.csv
new file mode 100644
index 0000000000..f558fa9e69
--- /dev/null
+++ b/python-pointblank/pointblank_atoms.csv
@@ -0,0 +1,14 @@
+atom_id,symbol,x,y,z,fx,fy,fz
+0,Cu,1.0,0.5,0.1,0.1,0.0,0.0
+1,Pt,2.1,1.5,0.2,-0.2,0.1,-0.1
+2,Cu,3.2,2.5,0.3,0.3,-0.1,0.1
+3,Pt,4.3,3.5,0.4,-0.1,0.0,0.0
+4,Cu,5.4,4.5,0.5,0.2,0.1,-0.1
+5,Pt,6.5,5.5,0.6,-0.3,-0.1,0.1
+6,Cu,7.6,6.5,0.7,0.1,0.0,0.0
+7,Pt,8.7,7.5,0.8,-0.2,0.1,-0.1
+8,Cu,9.8,8.5,0.9,0.3,-0.1,0.1
+9,Pt,10.9,9.5,1.0,-0.1,0.0,0.0
+10,Zz,0.5,0.5,0.1,0.0,0.0,0.0
+11,Cu,,1.5,0.2,0.0,0.0,0.0
+12,Pt,12.1,2.5,0.3,1500.0,0.0,0.0
diff --git a/python-pointblank/pointblank_atoms.py b/python-pointblank/pointblank_atoms.py
new file mode 100644
index 0000000000..a1bec97dba
--- /dev/null
+++ b/python-pointblank/pointblank_atoms.py
@@ -0,0 +1,43 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pointblank[pl]",
+# ]
+# ///
+
+import polars as pl
+import pointblank as pb
+
+VALID_ELEMENTS = ["Cu", "Pt"]
+
+
+def main() -> None:
+    """Check the atoms CSV, then print the rows that pass and fail."""
+    checks = pb.Validate(
+        data=pl.read_csv("pointblank_atoms.csv"),
+        tbl_name="atoms_from_parser",
+        label="Round-trip validation before re-export",
+        thresholds=pb.Thresholds(warning=0.02, error=0.05, critical=0.07),
+    )
+    checks = checks.col_vals_in_set(columns="symbol", set=VALID_ELEMENTS)
+    checks = checks.col_vals_not_null(columns=["x", "y", "z"])
+    checks = checks.col_vals_between(columns=["x", "y", "z"], left=0, right=20)
+    checks = checks.col_vals_between(columns="fx", left=-1000, right=1000)
+    outcome = checks.interrogate()
+
+    # Per the Pointblank docs, sundering returns the rows that passed every
+    # row-wise step ("pass") or failed at least one of them ("fail").
+    passed = outcome.get_sundered_data(type="pass")
+    failed = outcome.get_sundered_data(type="fail")
+
+    print(f"Safe to re-export: {len(passed)} rows")
+    print(f"Needs review: {len(failed)} rows")
+
+    shown = ["atom_id", "symbol", "x", "fx"]
+    for heading, rows in (("\nClean rows", passed), ("\nDirty rows", failed)):
+        print(heading)
+        print(rows.select(shown))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python-pointblank/pointblank_atoms.yaml b/python-pointblank/pointblank_atoms.yaml
new file mode 100644
index 0000000000..edeac47fa8
--- /dev/null
+++ b/python-pointblank/pointblank_atoms.yaml
@@ -0,0 +1,22 @@
+tbl: pointblank_atoms.csv
+df_library: polars
+tbl_name: "Atom Validation"
+label: "Tutorial YAML validation"
+thresholds:
+  warning: 0.02
+  error: 0.05
+  critical: 0.07
+steps:
+  - col_vals_in_set:
+      columns: symbol
+      set: [Cu, Pt]
+  - col_vals_not_null:
+      columns: [x, y, z]
+  - col_vals_between:
+      columns: [x, y, z]
+      left: 0
+      right: 20
+  - col_vals_between:
+      columns: fx
+      left: -1000
+      right: 1000
diff --git a/python-pointblank/pointblank_quickstart.py b/python-pointblank/pointblank_quickstart.py
new file mode 100644
index 0000000000..e51b4875eb
--- /dev/null
+++ b/python-pointblank/pointblank_quickstart.py
@@ -0,0 +1,39 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pointblank[pl]",
+# ]
+# ///
+
+import pointblank as pb
+
+
+def main() -> None:
+    """Validate three small_table columns and print per-step counts."""
+    table = pb.load_dataset("small_table", tbl_type="polars")
+    plan = pb.Validate(
+        data=table,
+        tbl_name="small_table",
+        label="Quickstart validation",
+    )
+    plan = plan.col_vals_between(columns="d", left=0, right=5000)
+    plan = plan.col_vals_in_set(columns="f", set=["low", "mid", "high"])
+    plan = plan.col_vals_not_null(columns="c")
+    outcome = plan.interrogate()
+
+    # Keep only the report columns that the printed summary uses.
+    wanted = ["step_description", "pass_n", "failed_n"]
+    report = outcome.get_dataframe_report()
+    rows = report.select(wanted).iter_rows(named=True)
+
+    print("Validation summary:\n")
+    # Description goes in a 20-wide field; pass count is left-aligned in 4.
+    for row in rows:
+        name = row["step_description"]
+        passed = row["pass_n"]
+        failed = row["failed_n"]
+        print(f"{name:20}passed={passed:<4}failed={failed}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python-pointblank/pointblank_report.html b/python-pointblank/pointblank_report.html
new file mode 100644
index 0000000000..6dae8b9841
--- /dev/null
+++ b/python-pointblank/pointblank_report.html
@@ -0,0 +1,443 @@
+
+ + ++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Pointblank Validation
Tutorial YAML validation
PolarsAtom ValidationWARNING0.02ERROR0.05CRITICAL0.07
STEPCOLUMNSVALUESTBLEVALUNITSPASSFAILWECEXT
#FF33001 +
+ + + col_vals_in_set + + + + + + + +
+
+
col_vals_in_set()
+
+ +
symbolCu, Pt + + + + + + + +1312
0.92
1
0.08
#FF33002 +
+ + + col_vals_not_null + + + + + + + + + +
+
+
col_vals_not_null()
+
+ +
x + + + + + + + +1312
0.92
1
0.08
#4CA64C3 +
+ + + col_vals_not_null + + + + + + + + + +
+
+
col_vals_not_null()
+
+ +
y + + + + + + + +1313
1.00
0
0.00
#4CA64C4 +
+ + + col_vals_not_null + + + + + + + + + +
+
+
col_vals_not_null()
+
+ +
z + + + + + + + +1313
1.00
0
0.00
#FF33005 +
+ + + col_vals_between + + + + + + + +
+
+
col_vals_between()
+
+ +
x[0, 20] + + + + + + + +1312
0.92
1
0.08
#4CA64C6 +
+ + + col_vals_between + + + + + + + +
+
+
col_vals_between()
+
+ +
y[0, 20] + + + + + + + +1313
1.00
0
0.00
#4CA64C7 +
+ + + col_vals_between + + + + + + + +
+
+
col_vals_between()
+
+ +
z[0, 20] + + + + + + + +1313
1.00
0
0.00
#FF33008 +
+ + + col_vals_between + + + + + + + +
+
+
col_vals_between()
+
+ +
fx[-1000, 1000] + + + + + + + +1312
0.92
1
0.08
2026-05-06 13:39:39 UTC< 1 s2026-05-06 13:39:39 UTC
+ +
diff --git a/python-pointblank/pointblank_report.png b/python-pointblank/pointblank_report.png
new file mode 100644
index 0000000000..d3d2d26824
Binary files /dev/null and b/python-pointblank/pointblank_report.png differ
diff --git a/python-pointblank/pointblank_starter_validation.yaml b/python-pointblank/pointblank_starter_validation.yaml
new file mode 100644
index 0000000000..4a5f405b2c
--- /dev/null
+++ b/python-pointblank/pointblank_starter_validation.yaml
@@ -0,0 +1,24 @@
+# Starter Pointblank template for adapting to your own pipeline.
+#
+# You can run this template against a real file with:
+#   uv run --no-project --with 'pointblank[pl]' pb run pointblank_starter_validation.yaml --data your_data.csv --fail-on critical
+
+tbl: small_table
+df_library: polars
+tbl_name: "Starter Validation"
+label: "Adapt this template to your data"
+thresholds:
+  warning: 0.02
+  error: 0.05
+  critical: 0.10
+steps:
+  - col_exists:
+      columns: [record_id, status, amount]
+  - col_vals_not_null:
+      columns: record_id
+  - col_vals_in_set:
+      columns: status
+      set: [pending, shipped, delivered]
+  - col_vals_gt:
+      columns: amount
+      value: 0
diff --git a/python-pointblank/pointblank_thresholds.py b/python-pointblank/pointblank_thresholds.py
new file mode 100644
index 0000000000..7e9b8d99c0
--- /dev/null
+++ b/python-pointblank/pointblank_thresholds.py
@@ -0,0 +1,48 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pointblank[pl]",
+# ]
+# ///
+
+import pointblank as pb
+
+
+def main() -> None:
+    """Interrogate small_table against thresholds and print the outcome."""
+    table = pb.load_dataset("small_table", tbl_type="polars")
+    limits = pb.Thresholds(warning=0.05, error=0.10, critical=0.15)
+    # Only the warning and critical levels carry a message template here.
+    notices = pb.Actions(
+        warning=(
+            "Warning: step {step} reached {level} severity during {type}."
+        ),
+        critical=(
+            "Critical: step {step} reached {level} severity during {type}."
+        ),
+    )
+    plan = pb.Validate(
+        data=table,
+        tbl_name="small_table",
+        label="Threshold-driven validation",
+        thresholds=limits,
+        actions=notices,
+    )
+    plan = plan.col_vals_between(columns="d", left=0, right=5000)
+    plan = plan.col_vals_not_null(columns="c")
+    plan = plan.rows_distinct()
+    outcome = plan.interrogate()
+
+    print("All checks passed perfectly:", outcome.all_passed())
+    over_error = outcome.above_threshold(level="error")
+    print("Anything above the error threshold:", over_error)
+
+    # An AssertionError raised here is printed instead of propagating.
+    try:
+        outcome.assert_below_threshold(level="critical")
+    except AssertionError as failure:
+        print("CI gate tripped:", failure)
+
+
+if __name__ == "__main__":
+    main()