fix spark benchmark for 100k rows that suffers from a cold start issue #190

Workflow file for this run

name: PyDeequ V2 Tests
on:
  push:
    branches:
      - "**"
  pull_request:
    branches:
      - "master"
jobs:
  # V2 tests with Spark Connect (Python 3.12)
  v2-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        name: Install Python 3.12
        with:
          python-version: "3.12"
      - uses: actions/setup-java@v4
        name: Setup Java 17
        with:
          distribution: "corretto"
          java-version: "17"
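      # Download a full Spark distribution; SPARK_HOME is exported via
      # GITHUB_ENV so later steps can start and stop the Connect server.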
      - name: Download Spark 3.5
        run: |
          curl -L -o spark-3.5.0-bin-hadoop3.tgz \
            https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
          tar -xzf spark-3.5.0-bin-hadoop3.tgz
          echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> $GITHUB_ENV
      - name: Download Deequ JAR
        run: |
          curl -L -o deequ_2.12-2.1.0b-spark-3.5.jar \
            https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar
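      # The pyspark[connect] extra installs the gRPC-based Spark Connect client.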
      - name: Install Python dependencies
        run: |
          pip install --upgrade pip setuptools
          pip install poetry==1.7.1
          poetry install
          poetry add "pyspark[connect]==3.5.0"
      - name: Run V2 unit tests
        run: |
          poetry run pytest tests/v2/test_unit.py -v
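      # The server loads the Spark Connect package plus the Deequ JAR;
      # DeequRelationPlugin registers Deequ's custom relations with the
      # Connect server so the V2 client API can invoke them remotely.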
      - name: Start Spark Connect Server
        run: |
          $SPARK_HOME/sbin/start-connect-server.sh \
            --packages org.apache.spark:spark-connect_2.12:3.5.0 \
            --jars $PWD/deequ_2.12-2.1.0b-spark-3.5.jar \
            --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
          # Wait for server to start
          sleep 20
          # Verify server is running
          ps aux | grep SparkConnectServer | grep -v grep
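      # SPARK_REMOTE points the pyspark client at the local Connect endpoint
      # (15002 is the server's default port).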
      - name: Run V2 integration tests
        env:
          SPARK_REMOTE: "sc://localhost:15002"
        run: |
          poetry run pytest tests/v2/ -v --ignore=tests/v2/test_unit.py
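      # if: always() guarantees cleanup even when earlier steps fail.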
      - name: Stop Spark Connect Server
        if: always()
        run: |
          $SPARK_HOME/sbin/stop-connect-server.sh || true
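
For reference, a minimal sketch of how a test process reaches the server started above, using only standard PySpark; the sample DataFrame is illustrative and not taken from this repository's V2 suite:

    import os
    from pyspark.sql import SparkSession

    # Connect to the Spark Connect server started by the workflow;
    # fall back to the local endpoint the workflow uses if SPARK_REMOTE is unset.
    remote = os.environ.get("SPARK_REMOTE", "sc://localhost:15002")
    spark = SparkSession.builder.remote(remote).getOrCreate()

    # A tiny DataFrame stands in for real test data; operations on it are
    # executed on the Connect server rather than in this process.
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    assert df.count() == 2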