diff --git a/docker/build.sh b/docker/build.sh index 7cf3d95a8ce..77c8a54767a 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -81,6 +81,17 @@ if [ "$SEDONA_VERSION" = "latest" ]; then # The compilation must take place outside Docker to avoid unnecessary maven packages mvn clean install -DskipTests -Dspark="${SEDONA_SPARK_VERSION}" -Dscala=2.13 +else + # When building against a published Sedona version, install-sedona.sh + # downloads the shaded JAR from Maven Central inside the container and + # never reads spark-shaded/target/. Any stale Maven artifacts in the + # local tree would still be sent in the build context, baked into an + # image layer by the `COPY ./spark-shaded/` step, ship in the published + # image, and add to every pull's download size — even though the dockerfile's trailing + # `RUN rm -rf ${SEDONA_HOME}` deletes them from the runtime filesystem. + # apache/sedona:1.9.0 hit this regression and shipped 1.1 GB heavier than + # 1.8.1 (4.03 GB vs 2.97 GB compressed) for exactly this reason.
+ rm -rf spark-shaded/target python/build python/dist fi # -- Building the image diff --git a/docker/sedona-docker.dockerfile b/docker/sedona-docker.dockerfile index b12c98d5104..4156fa36c3b 100644 --- a/docker/sedona-docker.dockerfile +++ b/docker/sedona-docker.dockerfile @@ -43,14 +43,14 @@ ENV PYTHONPATH=${SPARK_HOME}/python # Set up OS libraries and PySpark RUN apt-get update RUN apt-get install -y openjdk-17-jdk-headless curl python3-pip maven -RUN pip3 install pipenv --break-system-packages +RUN pip3 install --no-cache-dir pipenv --break-system-packages COPY ./docker/install-spark.sh ${SEDONA_HOME}/docker/ RUN chmod +x ${SEDONA_HOME}/docker/install-spark.sh RUN ${SEDONA_HOME}/docker/install-spark.sh ${spark_version} ${hadoop_s3_version} ${aws_sdk_version} # Install Python dependencies COPY docker/requirements.txt /opt/requirements.txt -RUN pip3 install -r /opt/requirements.txt --break-system-packages +RUN pip3 install --no-cache-dir -r /opt/requirements.txt --break-system-packages # Copy local compiled jars and python code to the docker environment diff --git a/docker/sedona-docker.dockerfile.dockerignore b/docker/sedona-docker.dockerfile.dockerignore index 3699b015a2b..0b5ea0bd6ac 100644 --- a/docker/sedona-docker.dockerfile.dockerignore +++ b/docker/sedona-docker.dockerfile.dockerignore @@ -7,3 +7,16 @@ !docs/usecases/** !python/** !spark-shaded/** + +# Re-exclude Python build artifacts and pyc caches so a tree that has +# had `python -m build` run against it does not balloon the COPY layers. +# Note: `**/target/` is intentionally NOT excluded here because the +# build.sh `latest` mode runs `mvn clean install -DskipTests` and then +# install-sedona.sh inside the image copies the freshly-built shaded +# JAR from `${SEDONA_HOME}/spark-shaded/target/`. A stale +# `spark-shaded/target/` from a prior local build is removed by build.sh +# (non-latest branch) instead — see docker/build.sh. +python/build/ +python/dist/ +**/*.egg-info/ +**/__pycache__/