Skip to content

Commit 25aa2df

Browse files
committed
V2 rewrite (beta): Support Spark Connect
1 parent ca8e9e1 commit 25aa2df

20 files changed

Lines changed: 4713 additions & 156 deletions

README.md

Lines changed: 363 additions & 134 deletions
Large diffs are not rendered by default.

pydeequ/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# ANY KIND, either express or implied. See the License for the specific
1313
# language governing permissions and limitations under the License.
1414
"""Placeholder docstrings"""
15-
__version__ = "1.2.0"
15+
__version__ = "2.0.0b1"
1616

1717
from pyspark.sql import SparkSession
1818

pydeequ/analyzers.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from pydeequ.repository import MetricsRepository, ResultKey
1111
from enum import Enum
1212
from pydeequ.scala_utils import to_scala_seq
13-
from pydeequ.configs import SPARK_VERSION
1413

1514
class _AnalyzerObject:
1615
"""
@@ -852,4 +851,4 @@ def _create_java_object(self, jvm):
852851
elif self == DataTypeInstances.Fractional:
853852
return dataType_analyzers_class.Fractional()
854853
else:
855-
raise ValueError(f"{jvm} is not a valid datatype Object")
854+
raise ValueError(f"{jvm} is not a valid datatype Object")

pydeequ/checks.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
from pydeequ.check_functions import is_one
88
from pydeequ.scala_utils import ScalaFunction1, to_scala_seq
9-
from pydeequ.configs import SPARK_VERSION
109

1110
# TODO implement custom assertions
1211
# TODO implement all methods without outside class dependencies

pydeequ/v2/__init__.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
PyDeequ Spark Connect Module.
4+
5+
This module provides Spark Connect compatible implementations of PyDeequ's
6+
data quality verification capabilities. It replaces the Py4J-based bridge
7+
with a protobuf-based communication protocol that works with Spark Connect's
8+
client-server architecture.
9+
10+
Key differences from the legacy Py4J-based PyDeequ:
11+
1. Uses serializable predicates instead of Python lambdas
12+
2. Communicates via protobuf messages over gRPC
13+
3. No direct JVM access required
14+
15+
Example usage:
16+
from pyspark.sql import SparkSession
17+
from pydeequ.v2 import VerificationSuite, Check, CheckLevel
18+
from pydeequ.v2.predicates import gte, eq
19+
20+
# Connect to Spark Connect server
21+
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
22+
23+
# Create a check with constraints
24+
check = (Check(CheckLevel.Error, "Data quality check")
25+
.isComplete("id")
26+
.hasCompleteness("email", gte(0.95))
27+
.hasSize(eq(1000)))
28+
29+
# Run verification
30+
result = (VerificationSuite(spark)
31+
.onData(df)
32+
.addCheck(check)
33+
.run())
34+
35+
# Result is a DataFrame with check results
36+
result.show()
37+
"""
38+
39+
# Import predicates
40+
# Import analyzers
41+
from pydeequ.v2.analyzers import (
42+
ApproxCountDistinct,
43+
ApproxQuantile,
44+
Completeness,
45+
Compliance,
46+
Correlation,
47+
CountDistinct,
48+
DataType,
49+
Distinctness,
50+
Entropy,
51+
Histogram,
52+
Maximum,
53+
MaxLength,
54+
Mean,
55+
Minimum,
56+
MinLength,
57+
MutualInformation,
58+
PatternMatch,
59+
Size,
60+
StandardDeviation,
61+
Sum,
62+
Uniqueness,
63+
UniqueValueRatio,
64+
)
65+
66+
# Import checks
67+
from pydeequ.v2.checks import (
68+
Check,
69+
CheckLevel,
70+
)
71+
from pydeequ.v2.predicates import (
72+
Predicate,
73+
between,
74+
eq,
75+
gt,
76+
gte,
77+
is_non_negative,
78+
is_one,
79+
is_positive,
80+
is_zero,
81+
lt,
82+
lte,
83+
neq,
84+
)
85+
86+
# Import verification
87+
from pydeequ.v2.verification import (
88+
AnalysisRunBuilder,
89+
AnalysisRunner,
90+
AnalyzerContext,
91+
VerificationRunBuilder,
92+
VerificationSuite,
93+
)
94+
95+
__all__ = [
96+
# Predicates
97+
"Predicate",
98+
"eq",
99+
"neq",
100+
"gt",
101+
"gte",
102+
"lt",
103+
"lte",
104+
"between",
105+
"is_one",
106+
"is_zero",
107+
"is_positive",
108+
"is_non_negative",
109+
# Checks
110+
"Check",
111+
"CheckLevel",
112+
# Analyzers
113+
"Size",
114+
"Completeness",
115+
"Mean",
116+
"Sum",
117+
"Maximum",
118+
"Minimum",
119+
"StandardDeviation",
120+
"Distinctness",
121+
"Uniqueness",
122+
"UniqueValueRatio",
123+
"CountDistinct",
124+
"ApproxCountDistinct",
125+
"ApproxQuantile",
126+
"Correlation",
127+
"MutualInformation",
128+
"MaxLength",
129+
"MinLength",
130+
"PatternMatch",
131+
"Compliance",
132+
"Entropy",
133+
"Histogram",
134+
"DataType",
135+
# Verification
136+
"VerificationSuite",
137+
"VerificationRunBuilder",
138+
"AnalysisRunner",
139+
"AnalysisRunBuilder",
140+
"AnalyzerContext",
141+
]

0 commit comments

Comments (0)