Skip to content

Commit a9be844

Browse files
chenliu0831 and claude committed
Fix connect() for Spark Connect and simplify CI workflow
- Handle Spark Connect session type in connect() (separate class from pyspark.sql.SparkSession)
- Remove manual server start/stop from CI; conftest fixture handles it
- Accept NaN for non-numeric profile stats from Spark

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent fb45401 commit a9be844

3 files changed

Lines changed: 15 additions & 21 deletions

File tree

.github/workflows/base.yml

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -51,24 +51,9 @@ jobs:
5151
run: |
5252
pytest tests/v2/test_unit.py -v
5353
54-
- name: Start Spark Connect Server
55-
run: |
56-
$SPARK_HOME/sbin/start-connect-server.sh \
57-
--packages org.apache.spark:spark-connect_2.12:3.5.0 \
58-
--jars $PWD/deequ_2.12-2.1.0b-spark-3.5.jar \
59-
--conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
60-
# Wait for server to start
61-
sleep 20
62-
# Verify server is running
63-
ps aux | grep SparkConnectServer | grep -v grep
64-
6554
- name: Run V2 integration tests
6655
env:
6756
SPARK_REMOTE: "sc://localhost:15002"
57+
DEEQU_JAR: ${{ github.workspace }}/deequ_2.12-2.1.0b-spark-3.5.jar
6858
run: |
6959
pytest tests/v2/ -v --ignore=tests/v2/test_unit.py
70-
71-
- name: Stop Spark Connect Server
72-
if: always()
73-
run: |
74-
$SPARK_HOME/sbin/stop-connect-server.sh || true

pydeequ/engines/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,14 +380,21 @@ def connect(
380380
except ImportError:
381381
pass
382382

383-
# Try Spark
383+
# Try Spark (regular and Connect sessions are separate classes)
384384
try:
385385
from pyspark.sql import SparkSession
386386
if isinstance(connection, SparkSession):
387387
from pydeequ.engines.spark import SparkEngine
388388
return SparkEngine(connection, table=table, dataframe=dataframe)
389389
except ImportError:
390390
pass
391+
try:
392+
from pyspark.sql.connect.session import SparkSession as ConnectSession
393+
if isinstance(connection, ConnectSession):
394+
from pydeequ.engines.spark import SparkEngine
395+
return SparkEngine(connection, table=table, dataframe=dataframe)
396+
except ImportError:
397+
pass
391398

392399
raise ValueError(
393400
f"Unsupported connection type: {type(connection).__name__}. "

tests/v2/test_profiles.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,14 +90,16 @@ def test_numeric_statistics(self, engine, profiler_df):
9090
assert age_profile["std_dev"] is not None
9191

9292
def test_non_numeric_has_null_stats(self, engine, profiler_df):
93-
"""Test non-numeric columns have null for numeric stats."""
93+
"""Test non-numeric columns have null/NaN for numeric stats."""
9494
result = ColumnProfilerRunner(engine).onData(dataframe=profiler_df).run()
9595
rows = {r["column"]: r for r in result.to_dict('records')}
9696

9797
name_profile = rows["name"]
98-
assert name_profile["mean"] is None
99-
assert name_profile["minimum"] is None
100-
assert name_profile["maximum"] is None
98+
# Spark returns NaN for non-numeric stats, DuckDB returns None
99+
import math
100+
assert name_profile["mean"] is None or math.isnan(name_profile["mean"])
101+
assert name_profile["minimum"] is None or math.isnan(name_profile["minimum"])
102+
assert name_profile["maximum"] is None or math.isnan(name_profile["maximum"])
101103

102104

103105
class TestKLLProfiling:

0 commit comments

Comments (0)