fix spark benchmark for 100k rows that suffers from a cold start issue #190
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow for the PyDeequ V2 test suite.
#
# Runs the V2 unit tests first, then starts a local Spark Connect server with
# the Deequ relation plugin loaded and runs the remaining V2 tests against it.
name: PyDeequ V2 Tests

on:
  push:
    branches:
      - "**"
  pull_request:
    branches:
      - "master"

jobs:
  # V2 tests with Spark Connect (Python 3.12)
  v2-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        name: Install Python 3.12
        with:
          # Quoted so the version is never parsed as a float.
          python-version: "3.12"

      - uses: actions/setup-java@v4
        name: Setup Java 17
        with:
          distribution: "corretto"
          java-version: "17"

      - name: Download Spark 3.5
        run: |
          # --fail makes curl exit non-zero on HTTP errors instead of saving
          # the error page as the tarball; --retry absorbs transient failures.
          curl --fail --retry 3 -L -o spark-3.5.0-bin-hadoop3.tgz \
            https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
          tar -xzf spark-3.5.0-bin-hadoop3.tgz
          # Export SPARK_HOME to all subsequent steps of this job.
          echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> "$GITHUB_ENV"

      - name: Download Deequ JAR
        run: |
          # --fail: a missing release asset must fail this step, not a later one.
          curl --fail --retry 3 -L -o deequ_2.12-2.1.0b-spark-3.5.jar \
            https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip setuptools
          pip install poetry==1.7.1
          poetry install
          # Pin the Spark Connect client to the same version as the server.
          poetry add "pyspark[connect]==3.5.0"

      - name: Run V2 unit tests
        run: |
          poetry run pytest tests/v2/test_unit.py -v

      - name: Start Spark Connect Server
        run: |
          "$SPARK_HOME/sbin/start-connect-server.sh" \
            --packages org.apache.spark:spark-connect_2.12:3.5.0 \
            --jars "$PWD/deequ_2.12-2.1.0b-spark-3.5.jar" \
            --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin

          # Wait for the server to accept connections instead of sleeping a
          # fixed time: on a cold runner, resolving --packages can take longer
          # than a hard-coded delay. Polls the port used by SPARK_REMOTE below
          # for up to ~120s (60 attempts x 2s).
          ready=false
          for _ in $(seq 1 60); do
            if (exec 3<>/dev/tcp/localhost/15002) 2>/dev/null; then
              ready=true
              break
            fi
            sleep 2
          done

          if [ "$ready" != "true" ]; then
            echo "Spark Connect server did not open port 15002 in time" >&2
            # Best-effort dump of the daemon logs to aid debugging.
            cat "$SPARK_HOME"/logs/* 2>/dev/null || true
            exit 1
          fi

          # Verify server is running
          ps aux | grep SparkConnectServer | grep -v grep

      - name: Run V2 integration tests
        env:
          SPARK_REMOTE: "sc://localhost:15002"
        run: |
          # Everything under tests/v2/ except the unit tests already run above.
          poetry run pytest tests/v2/ -v --ignore=tests/v2/test_unit.py

      - name: Stop Spark Connect Server
        # Always attempt shutdown, even when earlier steps failed.
        if: always()
        run: |
          "$SPARK_HOME/sbin/stop-connect-server.sh" || true