awslabs
diff --git a/README.md b/README.md
Lines changed: 363 additions & 134 deletions
diff --git a/pydeequ/__init__.py b/pydeequ/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/pydeequ/analyzers.py b/pydeequ/analyzers.py
Lines changed: 1 addition & 2 deletions
diff --git a/pydeequ/checks.py b/pydeequ/checks.py
Lines changed: 0 additions & 1 deletion
diff --git a/pydeequ/v2/__init__.py b/pydeequ/v2/__init__.py
Lines changed: 141 additions & 0 deletions
@@ -12,7 +12,7 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 """Placeholder docstrings"""
-__version__ = "1.2.0"
+__version__ = "2.0.0b1"
 
 from pyspark.sql import SparkSession
 
 
@@ -10,7 +10,6 @@
 from pydeequ.repository import MetricsRepository, ResultKey
 from enum import Enum
 from pydeequ.scala_utils import to_scala_seq
-from pydeequ.configs import SPARK_VERSION
 
 class _AnalyzerObject:
     """
@@ -852,4 +851,4 @@ def _create_java_object(self, jvm):
         elif self == DataTypeInstances.Fractional:
             return dataType_analyzers_class.Fractional()
         else:
-            raise ValueError(f"{jvm} is not a valid datatype Object")
+            raise ValueError(f"{jvm} is not a valid datatype Object")
@@ -6,7 +6,6 @@
 
 from pydeequ.check_functions import is_one
 from pydeequ.scala_utils import ScalaFunction1, to_scala_seq
-from pydeequ.configs import SPARK_VERSION
 
 # TODO implement custom assertions
 # TODO implement all methods without outside class dependencies
 
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+"""
+PyDeequ Spark Connect Module.
+
+This module provides Spark Connect compatible implementations of PyDeequ's
+data quality verification capabilities. It replaces the Py4J-based bridge
+with a protobuf-based communication protocol that works with Spark Connect's
+client-server architecture.
+
+Key differences from the legacy Py4J-based PyDeequ:
+1. Uses serializable predicates instead of Python lambdas
+2. Communicates via protobuf messages over gRPC
+3. No direct JVM access required
+
+Example usage:
+    from pyspark.sql import SparkSession
+    from pydeequ.v2 import VerificationSuite, Check, CheckLevel
+    from pydeequ.v2.predicates import gte, eq
+
+    # Connect to Spark Connect server
+    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
+
+    # Create a check with constraints
+    check = (Check(CheckLevel.Error, "Data quality check")
+        .isComplete("id")
+        .hasCompleteness("email", gte(0.95))
+        .hasSize(eq(1000)))
+
+    # Run verification on an existing DataFrame `df` (not created in this snippet)
+    result = (VerificationSuite(spark)
+        .onData(df)
+        .addCheck(check)
+        .run())
+
+    # Result is a DataFrame with check results
+    result.show()
+"""
+
+# Re-export the public v2 API (analyzers, checks, predicates, verification)
+# Import analyzers
+from pydeequ.v2.analyzers import (
+    ApproxCountDistinct,
+    ApproxQuantile,
+    Completeness,
+    Compliance,
+    Correlation,
+    CountDistinct,
+    DataType,
+    Distinctness,
+    Entropy,
+    Histogram,
+    Maximum,
+    MaxLength,
+    Mean,
+    Minimum,
+    MinLength,
+    MutualInformation,
+    PatternMatch,
+    Size,
+    StandardDeviation,
+    Sum,
+    Uniqueness,
+    UniqueValueRatio,
+)
+
+# Import checks
+from pydeequ.v2.checks import (
+    Check,
+    CheckLevel,
+)
+from pydeequ.v2.predicates import (
+    Predicate,
+    between,
+    eq,
+    gt,
+    gte,
+    is_non_negative,
+    is_one,
+    is_positive,
+    is_zero,
+    lt,
+    lte,
+    neq,
+)
+
+# Import verification
+from pydeequ.v2.verification import (
+    AnalysisRunBuilder,
+    AnalysisRunner,
+    AnalyzerContext,
+    VerificationRunBuilder,
+    VerificationSuite,
+)
+
+__all__ = [
+    # Predicates
+    "Predicate",
+    "eq",
+    "neq",
+    "gt",
+    "gte",
+    "lt",
+    "lte",
+    "between",
+    "is_one",
+    "is_zero",
+    "is_positive",
+    "is_non_negative",
+    # Checks
+    "Check",
+    "CheckLevel",
+    # Analyzers
+    "Size",
+    "Completeness",
+    "Mean",
+    "Sum",
+    "Maximum",
+    "Minimum",
+    "StandardDeviation",
+    "Distinctness",
+    "Uniqueness",
+    "UniqueValueRatio",
+    "CountDistinct",
+    "ApproxCountDistinct",
+    "ApproxQuantile",
+    "Correlation",
+    "MutualInformation",
+    "MaxLength",
+    "MinLength",
+    "PatternMatch",
+    "Compliance",
+    "Entropy",
+    "Histogram",
+    "DataType",
+    # Verification
+    "VerificationSuite",
+    "VerificationRunBuilder",
+    "AnalysisRunner",
+    "AnalysisRunBuilder",
+    "AnalyzerContext",
+]