diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..4894e31
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,5 @@
+# GDAL platform tarball — ~250 MB, slowly-changing (bumped only when the
+# UbuntuGIS-PPA-resolved GDAL/PROJ/GEOS set changes). Stored via Git LFS so
+# clones stay light; LFS pulls the bytes on demand. See
+# resources/static/README.md for how to rebuild it.
+resources/static/geobrix-gdal-platform-noble.tar.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/package-geobrix-artifacts.yml b/.github/workflows/package-geobrix-artifacts.yml
new file mode 100644
index 0000000..bba4150
--- /dev/null
+++ b/.github/workflows/package-geobrix-artifacts.yml
@@ -0,0 +1,321 @@
+name: package geobrix artifacts
+# One-stop release packaging: builds the JAR + Python wheel inline,
+# repackages the committed GDAL platform tarball with the JAR baked in, and
+# attaches every release artifact (JAR, wheel, GDAL tarball, sidecar, init
+# script, docs zip) to an existing tag via `gh release upload --clobber`.
+#
+# Manual-only: Actions -> "package geobrix artifacts" -> Run workflow.
+#
+# Inputs:
+#   ref           - git ref to build from (branch / tag / SHA). Empty = the
+#                   ref the workflow was launched on.
+#   attach_to_tag - tag (e.g. v0.3.0) to attach all six files to. Empty =
+#                   produce workflow artifacts only, no tag mutation.
+#                   Checked (shape + release exists) before the build starts.
+#
+# Always uploaded as workflow artifacts (downloadable from the run page):
+#   1. geobrix-<version>-jar-with-dependencies.jar           (built inline)
+#   2. dblabs_geobrix-<version>-py3-none-any.whl             (built inline)
+#   3. geobrix-gdal-artifacts-v<version>-noble.tar.gz        (repackaged)
+#   4. geobrix-gdal-artifacts-v<version>-noble.tar.gz.sha256 (computed)
+#   5. geobrix-gdal-init.sh                                  (committed)
+#   6. geobrix-docs-<version>.zip                            (built inline from docs/)
+#
+# The slow PPA/apt dance does NOT run here - the GDAL platform layer
+# (~250 MB of .debs + wheels + JNI) is committed under resources/static/
+# (Git LFS) and was reviewed at the PR that bumped it. This workflow only
+# grafts the per-release JAR into a copy of those bytes and recomputes the
+# SHA256SUMS manifest. Total runtime ~2-3 min vs. ~15 if we rebuilt the
+# platform layer per release.
+#
+# Security: every workflow_dispatch input is surfaced as env: before any
+# run: block. Direct ${{ ... }} interpolation of user inputs into shell is
+# a command-injection risk - we don't do it. See:
+# https://github.blog/security/vulnerability-research/how-to-catch-github-actions-workflow-injections-before-attackers-do/
+#
+# All jobs run on the Databricks-hardened runner group (Labs lockdown policy).
+on:
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: "Git ref (branch / tag / SHA) to build from. Empty = the ref the workflow was launched on."
+        required: false
+        type: string
+        default: ""
+      attach_to_tag:
+        description: "Tag to attach the six files to (e.g. v0.3.0). Empty = workflow artifacts only."
+        required: false
+        type: string
+        default: ""
+
+permissions:
+  contents: read
+
+jobs:
+  package:
+    runs-on:
+      group: databrickslabs-protected-runner-group
+      labels: linux-ubuntu-latest
+    environment: runtime
+    permissions:
+      contents: write
+      id-token: write
+    env:
+      # NOTE(review): this job-wide token is visible to every step (mvn, npm,
+      # pip). Presumably the JFrog composite actions need it - confirm, else scope per step.
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      INPUT_REF: ${{ inputs.ref }}
+      INPUT_ATTACH_TO_TAG: ${{ inputs.attach_to_tag }}
+    strategy:
+      matrix:
+        # Quoted so every version stays a string regardless of YAML parser.
+        python: ["3.12.3"]
+        numpy: ["2.1.3"]
+        gdal: ["3.11.4"]
+        spark: ["4.0.0"]
+    steps:
+      # Fail fast: reject a malformed or non-existent release tag before
+      # spending minutes on the build. `gh release upload` (last step) only
+      # works against an existing release, so look for one up front.
+      - name: validate attach_to_tag
+        if: ${{ inputs.attach_to_tag != '' }}
+        shell: bash
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          case "$INPUT_ATTACH_TO_TAG" in
+            v[0-9]*|[0-9]*) : ;;
+            *)
+              echo "attach_to_tag '$INPUT_ATTACH_TO_TAG' doesn't look like a version tag." >&2
+              exit 1
+              ;;
+          esac
+          if ! gh release view "$INPUT_ATTACH_TO_TAG" --repo "$GITHUB_REPOSITORY" >/dev/null; then
+            echo "no release found for tag '$INPUT_ATTACH_TO_TAG' - create the release first." >&2
+            exit 1
+          fi
+
+      - name: checkout code (with LFS)
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+          token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
+          lfs: true
+
+      - name: verify platform tarball is LFS-pulled
+        shell: bash
+        run: |
+          PLATFORM=resources/static/geobrix-gdal-platform-noble.tar.gz
+          if [ ! -s "$PLATFORM" ]; then
+            echo "$PLATFORM is missing or empty" >&2
+            exit 1
+          fi
+          if head -c 50 "$PLATFORM" | grep -q '^version https://git-lfs'; then
+            echo "$PLATFORM is an LFS pointer, not the binary - checkout's lfs: true didn't resolve it." >&2
+            exit 1
+          fi
+          ( cd resources/static && sha256sum -c geobrix-gdal-platform-noble.tar.gz.sha256 )
+
+      - name: Configure JDK
+        uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
+        with:
+          java-version: "17"
+          distribution: "zulu"
+          cache: "maven"
+          cache-dependency-path: "pom.xml"
+
+      - name: Set Maven opts
+        shell: bash
+        run: echo "MAVEN_OPTS=-Xmx4g -XX:+UseG1GC" >> "$GITHUB_ENV"
+
+      - name: Create pip cache key file
+        shell: bash
+        env:
+          GH_REF: ${{ github.ref }}
+          PY: ${{ matrix.python }}
+          NP: ${{ matrix.numpy }}
+          SP: ${{ matrix.spark }}
+          GD: ${{ matrix.gdal }}
+        run: |
+          echo "${GH_REF}-${PY}-${NP}-${SP}-${GD}" > .ci-pip-cache-key
+
+      - name: Pre-bootstrap pip for JFrog
+        uses: ./.github/actions/jfrog-pip-bootstrap
+
+      - name: Configure Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          cache: "pip"
+          cache-dependency-path: ".ci-pip-cache-key"
+          python-version: ${{ matrix.python }}
+
+      - name: Authenticate for JFrog
+        uses: ./.github/actions/jfrog-auth
+
+      - name: Verify Maven dependency PGP signatures
+        shell: bash
+        run: ./scripts/security/maven-pgp-verify
+
+      - name: build scala JAR (skip tests, no GDAL install)
+        shell: bash
+        run: |
+          mvn -C -q clean package -DskipTests -Dscoverage.skip -Dscalastyle.fail=false
+          ls -lh target/geobrix-*-jar-with-dependencies.jar
+
+      # Single source of truth for the JAR path and release version; every
+      # later step reads GBX_JAR / GBX_VERSION instead of re-globbing target/.
+      - name: resolve JAR path and version
+        shell: bash
+        run: |
+          set -euo pipefail
+          shopt -s nullglob
+          JARS=( target/geobrix-*-jar-with-dependencies.jar )
+          if [ "${#JARS[@]}" -ne 1 ]; then
+            echo "expected exactly one geobrix-*-jar-with-dependencies.jar in target/, found ${#JARS[@]}" >&2
+            exit 1
+          fi
+          JAR=${JARS[0]}
+          VERSION=$(basename "$JAR" | sed -nE 's/^geobrix-(.+)-jar-with-dependencies\.jar$/\1/p')
+          if [ -z "$VERSION" ]; then
+            echo "could not parse version from JAR name: $(basename "$JAR")" >&2
+            exit 1
+          fi
+          echo "GeoBrix version: $VERSION"
+          echo "GBX_JAR=$JAR" >> "$GITHUB_ENV"
+          echo "GBX_VERSION=$VERSION" >> "$GITHUB_ENV"
+
+      # Hash-pinned minimal build set: just build / setuptools / wheel. See
+      # python/geobrix/requirements-build.in for the source list; the .txt
+      # lockfile must be regenerated via `uv pip compile --generate-hashes`
+      # when bumping any of those. Slimmer than requirements-ci.txt (which
+      # pulls pytest, black, scientific stack) by ~30s of pip resolve time.
+      - name: install Python build deps (hash-pinned)
+        shell: bash
+        run: |
+          pip install --upgrade pip==25.0.1
+          pip install --require-hashes -r python/geobrix/requirements-build.txt
+
+      - name: build Python wheel
+        shell: bash
+        run: |
+          cd python/geobrix
+          python -m build
+          ls -lh dist/*.whl
+
+      # ---- docs ----------------------------------------------------------
+      # Docusaurus static-zip build, version-named from the JAR we just
+      # produced. This replaces the previously-committed
+      # resources/static/geobrix-docs-*.zip — the doc bundle is now a
+      # release-time output, since release IS the natural time to cut a
+      # static docs snapshot.
+      - name: Setup Node
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
+        with:
+          node-version: "20"
+          cache: "npm"
+          cache-dependency-path: docs/package-lock.json
+
+      # JFrog auth ran earlier; that step writes npm credentials too (per
+      # deploy-docs.yml's pattern), so `npm ci` below will route through
+      # the JFrog mirror.
+      - name: Install docs dependencies
+        shell: bash
+        run: cd docs && npm ci
+
+      - name: Build static docs zip
+        shell: bash
+        run: |
+          set -euo pipefail
+          ( cd docs && npm run build:static-zip )
+
+          mkdir -p dist
+          DOCS_ZIP="dist/geobrix-docs-${GBX_VERSION}.zip"
+          # Strip DS_Store noise; -q to keep run log readable.
+          ( cd docs/build-static-zip && zip -qr "../../$DOCS_ZIP" . -x "*.DS_Store" )
+
+          echo "GBX_DOCS_ZIP=$DOCS_ZIP" >> "$GITHUB_ENV"
+          ls -lh "$DOCS_ZIP"
+
+      - name: repackage platform tarball with JAR
+        shell: bash
+        run: |
+          set -euo pipefail
+          mkdir -p dist staging
+          rm -rf staging/bundle
+          mkdir -p staging/bundle
+
+          tar -xzf resources/static/geobrix-gdal-platform-noble.tar.gz \
+            -C staging/bundle --strip-components=1
+
+          cp "$GBX_JAR" "staging/bundle/$(basename "$GBX_JAR")"
+
+          ( cd staging/bundle && \
+            rm -f SHA256SUMS && \
+            find . -type f ! -name SHA256SUMS -print0 \
+              | LC_ALL=C sort -z \
+              | xargs -0 sha256sum > SHA256SUMS )
+
+          TARBALL=geobrix-gdal-artifacts-v${GBX_VERSION}-noble.tar.gz
+          tar --sort=name --mtime='UTC 2020-01-01' \
+            --owner=0 --group=0 --numeric-owner \
+            -czf "dist/$TARBALL" -C staging bundle/
+          ( cd dist && sha256sum "$TARBALL" > "$TARBALL.sha256" )
+
+          echo "GBX_TARBALL=$TARBALL" >> "$GITHUB_ENV"
+
+      - name: show release manifest
+        shell: bash
+        run: |
+          echo "=== files to publish ==="
+          ls -lh \
+            "${GBX_JAR}" \
+            "python/geobrix/dist/"*.whl \
+            "dist/${GBX_TARBALL}" \
+            "dist/${GBX_TARBALL}.sha256" \
+            scripts/geobrix-gdal-init.sh \
+            "${GBX_DOCS_ZIP}"
+          echo
+          echo "=== outer sidecar ==="
+          cat "dist/${GBX_TARBALL}.sha256"
+
+      - name: upload as workflow artifacts
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        with:
+          name: geobrix-release-artifacts
+          path: |
+            target/geobrix-*-jar-with-dependencies.jar
+            python/geobrix/dist/*.whl
+            dist/geobrix-gdal-artifacts-*.tar.gz
+            dist/geobrix-gdal-artifacts-*.tar.gz.sha256
+            scripts/geobrix-gdal-init.sh
+            dist/geobrix-docs-*.zip
+          if-no-files-found: error
+          retention-days: 30
+
+      - name: attach to tag
+        if: ${{ inputs.attach_to_tag != '' }}
+        shell: bash
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          # Tag shape and release existence were checked by "validate attach_to_tag".
+          WHL=$(ls python/geobrix/dist/*.whl | head -1)
+
+          gh release upload "$INPUT_ATTACH_TO_TAG" \
+            "$GBX_JAR" \
+            "$WHL" \
+            "dist/${GBX_TARBALL}" \
+            "dist/${GBX_TARBALL}.sha256" \
+            scripts/geobrix-gdal-init.sh \
+            "${GBX_DOCS_ZIP}" \
+            --clobber
+
+          echo "Attached to tag $INPUT_ATTACH_TO_TAG:"
+          echo " $(basename "$GBX_JAR")"
+          echo " $(basename "$WHL")"
+          echo " ${GBX_TARBALL}"
+          echo " ${GBX_TARBALL}.sha256"
+          echo " geobrix-gdal-init.sh"
+          echo " $(basename "${GBX_DOCS_ZIP}")"
diff --git a/.gitignore b/.gitignore
index 94ae5b5..593f14e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,10 @@
 /target/
 /spark-warehouse/
 /artifacts/
+# Local output of scripts/build-gdal-artifacts.sh — the platform tarball
+# + sidecar are moved into resources/static/ (committed via Git LFS);
+# everything else in dist/ is intermediate (extracted bundle, etc.).
+/dist/
 /python/geobrix/artifacts/
 /python/geobrix/test/vectorx/spark-warehouse/
 /python/geobrix/test/gridx/artifacts/
diff --git a/docs/docs/beta-release-notes.mdx b/docs/docs/beta-release-notes.mdx
index 2ed4345..4b7772e 100644
--- a/docs/docs/beta-release-notes.mdx
+++ b/docs/docs/beta-release-notes.mdx
@@ -24,6 +24,7 @@ Released 2026-05-19. Per-version highlights; full migration tables are in the pe
 - **Scalar args without `f.lit(...)`.** Python wrappers auto-wrap `bool` / `int` / `float` / `bytes`; Scala adds typed overloads. SQL was already natively-typed.
String literals still wrap in `f.lit(...)` per pyspark's column-ref convention. Details and migration examples in [Scalar values vs `lit(...)` wrapping](#scalar-values-vs-lit-wrapping). - **Example notebooks — EO Series, xView, and enablement diagrams.** New end-to-end walkthroughs under `docs/examples/` covering EO time-series, xView object-detection rasters, and RasterX architecture diagrams. - **Supply-chain hardening (lockdown).** Jobs pinned to the Databricks-hardened runner group (org-level allowlist, ephemeral VMs, constrained secret access); every Maven dependency, transitive dep, plugin, and plugin dependency is PGP-verified against `.maven-keys.list` before any compile or test execution; pip and Maven routed through JFrog with OIDC; init script + pinned package versions vetted; new [Security](./security.mdx) page in the docs. +- **Pre-built, hash-verified GDAL bundle.** The GDAL native install path is now a CI-built tarball (`geobrix-gdal-artifacts-v<version>-noble.tar.gz` + matching `.sha256` sidecar, attached to each release alongside a versioned `geobrix-gdal-init.sh`). Cluster start drops from ~15 minutes (legacy PPA dance per boot) to ~30–90 seconds (verify sidecar → extract → `dpkg -i`). Trust chain is now four layers: build-time GPG fingerprint pin + reviewed Git-LFS commit → per-file `SHA256SUMS` inside the tarball → outer `.sha256` sidecar in the staging Volume → the Volume's write ACL. The legacy on-cluster path is preserved as [`scripts/geobrix-gdal-init-ppa.sh`](https://github.com/databrickslabs/geobrix/blob/main/scripts/geobrix-gdal-init-ppa.sh) for bundle bootstrapping. Bundle is `amd64` / `x86_64` only (Intel or AMD CPUs); ARM-based instance types — AWS Graviton, Ampere, Apple Silicon — are not supported. See [Installation](./installation) and the rationale on the [Security](./security.mdx#pinned-gdal-native--multi-layer-trust-chain) page. 
--- diff --git a/docs/docs/developers.mdx b/docs/docs/developers.mdx index dbefc95..9fe8a79 100644 --- a/docs/docs/developers.mdx +++ b/docs/docs/developers.mdx @@ -39,6 +39,41 @@ GeoBrix is a multi-artifact repo: Scala/JVM core, Python bindings, docs, and too Development and CI use a **Docker** image (`geobrix-dev`) for a consistent environment; many Cursor commands run inside that container. +### Git LFS — required to clone the GDAL platform tarball + +The GDAL platform tarball at `resources/static/geobrix-gdal-platform-noble.tar.gz` (~90 MB, ships in every GeoBrix release as the runtime GDAL bundle) is stored via **Git LFS** so the binary lives in LFS storage instead of the git pack. The matching `.sha256` sidecar is small enough to live in git directly and is NOT LFS-tracked. The tracking rule is in [`.gitattributes`](https://github.com/databrickslabs/geobrix/blob/main/.gitattributes) at the repo root. + +#### One-time install per machine + +```bash +brew install git-lfs # macOS; or apt-get install git-lfs on Debian/Ubuntu +git lfs install # writes LFS filters into ~/.gitconfig +``` + +#### Cloning the repo + +After `git lfs install`, a normal `git clone` of geobrix automatically fetches LFS objects: + +```bash +git clone git@github.com:databrickslabs/geobrix.git +``` + +If you cloned **before** installing git-lfs, run `git lfs pull` from inside the working tree to fetch the binary. Without that step, `resources/static/geobrix-gdal-platform-noble.tar.gz` will be a ~130-byte LFS pointer file rather than the real 90 MB tarball, and the [`package-geobrix-artifacts.yml`](https://github.com/databrickslabs/geobrix/blob/main/.github/workflows/package-geobrix-artifacts.yml) workflow's `lfs: true` checkout will fail an integrity check. + +#### Updating the platform tarball + +Rebuild only when `GDAL_PPA_VERSION` changes, when DBR moves to a new Ubuntu LTS, or for a security advisory against one of the bundled libs. 
See [`resources/static/README.md`](https://github.com/databrickslabs/geobrix/blob/main/resources/static/README.md) for the full Docker-based recipe. The short version: + +1. Run [`scripts/build-gdal-artifacts.sh --platform-only`](https://github.com/databrickslabs/geobrix/blob/main/scripts/build-gdal-artifacts.sh) inside a fresh `ubuntu:24.04` container (Docker recipe in the README). +2. Move the resulting `geobrix-gdal-platform-noble.tar.gz` + `.sha256` from `dist/` into `resources/static/`. +3. `git add resources/static/geobrix-gdal-platform-noble.tar.gz` — the LFS filter intercepts via `.gitattributes`. Verify with `git lfs ls-files` (should list the tarball) and `git diff --cached --stat resources/static/geobrix-gdal-platform-noble.tar.gz` (should show ~3 lines added — the pointer — not 90 MB). +4. `git add resources/static/geobrix-gdal-platform-noble.tar.gz.sha256` — committed normally, not LFS. +5. Open a PR. The reviewer **re-runs the build script locally in their own `ubuntu:24.04` container** and confirms the resulting sha256 matches the committed sidecar before approving — that PR review is the trust anchor for every cluster that subsequently installs from this bundle. See [Security](./security#pinned-gdal-native--multi-layer-trust-chain) for the full chain. + +#### Storage considerations + +LFS bandwidth and storage come from the `databrickslabs` GitHub org quota. Each tarball bump consumes both. Don't rebuild the tarball just to bump GeoBrix versions — the release workflow grafts the per-release JAR onto the committed platform tarball without changing it. + ### Testing on a Databricks cluster You can run the **Essential bundle** and **primitive Volume tests** on a live Databricks cluster so that Volume paths are FUSE-mounted and the bundle uses pathlib/shutil only (no Databricks Files API). 
diff --git a/docs/docs/installation.mdx b/docs/docs/installation.mdx index 8635b7a..41cc1e9 100644 --- a/docs/docs/installation.mdx +++ b/docs/docs/installation.mdx @@ -16,45 +16,97 @@ GeoBrix currently offers heavy-weight, distributed APIs, primarily written in Sc targets the Ubuntu 24.04 (`noble`) base used by DBR 17.3 LTS; other runtimes use different bases and will either fail to install or silently fall back to an unpinned GDAL. -- Classic Databricks cluster (not Serverless) -- GDAL native libraries (installed by the init script below) +- Classic Databricks cluster (not Serverless). +- **Non-ARM instance type (Intel or AMD x86_64).** The GDAL bundle ships + `amd64` `.deb` packages from the UbuntuGIS PPA — `amd64` and `x86_64` + are the same architecture (just Debian's name vs. the kernel's), and + Intel and AMD CPUs are interchangeable. ARM-based instance types — AWS + Graviton, Ampere, Apple Silicon — are not supported; the init script + refuses to run on `aarch64`. +- GDAL native libraries (installed by the init script below). + +:::note Init script is versioned per release +The script shown on this page is the **latest** version, imported live from +`main`. Each GeoBrix release attaches its own copy of `geobrix-gdal-init.sh` +to the [release page](https://github.com/databrickslabs/geobrix/releases) so +the script and bundle that ship in that release are unambiguously paired. +If you're installing a specific GeoBrix version, prefer the init script +from that release's assets over the version shown below. +::: ## Installation Steps -### 1. Download GeoBrix Artifacts +### 1. 
Download Release Artifacts -GeoBrix requires the following artifacts: +Each GeoBrix [release](https://github.com/databrickslabs/geobrix/releases) +attaches three files for the GDAL install path: -- **JAR file**: `geobrix-*-jar-with-dependencies.jar` -- **Shared Object**: `libgdalalljni.so` (GDAL native library) -- **Python Wheel**: `geobrix-*-py3-none-any.whl` +- `geobrix-gdal-artifacts-v<version>-noble.tar.gz` — the bundle (JAR, + `libgdalalljni.so`, GDAL `.deb`s, Python wheels — everything pre-built and + pre-verified in CI). +- `geobrix-gdal-artifacts-v<version>-noble.tar.gz.sha256` — sidecar file + containing the tarball's sha256. The init script reads this at cluster + start as its trust anchor. +- `geobrix-gdal-init.sh` — the cluster init script for this release. -These are currently delivered via [Releases](https://github.com/databrickslabs/geobrix/releases) artifacts. +Download all three. Verify the tarball locally before uploading anywhere: -### 2. Upload to Databricks Volume +```bash +sha256sum -c geobrix-gdal-artifacts-v<version>-noble.tar.gz.sha256 +``` -1. Create or use an existing Databricks Volume -2. Upload the following files to your Volume (`*` for version): - - `geobrix-*-jar-with-dependencies.jar` - - `libgdalalljni.so` - - `geobrix-*-py3-none-any.whl` +### 2. Upload Bundle to a Unity Catalog Volume -### 3. Create Init Script +Upload the **tarball and the sidecar** (not the init script) to a Unity +Catalog Volume your workspace owns. The init script discovers them there. -GeoBrix requires GDAL natives, which are currently best installed via an init script on a classic cluster. +``` +/Volumes/<catalog>/<schema>/<volume>/geobrix-gdal-artifacts-v<version>-noble.tar.gz +/Volumes/<catalog>/<schema>/<volume>/geobrix-gdal-artifacts-v<version>-noble.tar.gz.sha256 +``` -1. Use the init script from the repo: [scripts/geobrix-gdal-init.sh](https://github.com/databrickslabs/geobrix/blob/main/scripts/geobrix-gdal-init.sh) (shown below) -2. Modify the `VOL_DIR` variable to point to your Volume location where you uploaded the artifacts -3. 
Upload the modified init script to your Databricks Volume +The Volume's write ACL is the trust boundary that lets the cluster trust +the sidecar — keep write access to that Volume restricted to the +release/operator process. -**What the script does** (see [Security](./security) for the full rationale): +### 3. Stage the Init Script + +The init script is **not** uploaded to the bundle Volume; it's the cluster's +init-script attachment. -- Adds the UbuntuGIS PPA using an inline-embedded signing key, verified against the expected fingerprint (`UBUNTUGIS_FPR`) before any package install. -- Installs the pinned GDAL package version (`GDAL_PPA_VERSION`) from the verified PPA so the native version matches the JNI binding shipped in the JAR. -- Installs the Python `GDAL` wheel from source against the pinned headers (no opportunistic wheel from PyPI). -- Copies `libgdalalljni.so` and `geobrix-*-jar-with-dependencies.jar` from `VOL_DIR` into place. +1. Upload the release's `geobrix-gdal-init.sh` to a workspace file path or + a separate Volume the cluster can read. +2. Edit `VOL_DIR` (one line near the top of the script) to point at the + Volume from step 2. +3. No other edits are required — the script reads the expected tarball + hash from the sidecar at runtime, so a future bundle bump doesn't + require a new script. + +**What the script does** (see [Security](./security) for the full rationale): -`VOL_DIR` is the only line you should edit. +- Verifies the cluster architecture is `amd64` (fails fast on ARM). +- Discovers the `*.tar.gz.sha256` sidecar in `VOL_DIR` and verifies the + tarball matches. +- Extracts the bundle to local disk and verifies the per-file + `SHA256SUMS` manifest inside. +- Installs the bundled `.deb`s (including the UbuntuGIS-PPA versions of + `libgdal`, `libproj`, `libgeos`, `proj-data`, etc.) via `dpkg -i`. 
+- Installs the pre-built, source-compiled GDAL Python wheel and the + pinned pip toolchain from the bundled `wheels/` directory — no PyPI + round-trip. +- Copies `libgdalalljni.so` and `geobrix-*-jar-with-dependencies.jar` + from the extracted bundle into place. +- Logs the installed `libgdal*` / `libproj*` / `libgeos*` versions to + the init-script stdout log for later audit. + +:::note GeoBrix Python wheel is installed separately +The init script does **not** install `dblabs_geobrix-*-py3-none-any.whl` +— that wheel is attached as a cluster Library via the Databricks UI +(see step 4 below). Only the GDAL toolchain wheels bundled inside the +release tarball (`pip`, `setuptools`, `wheel`, `cython`, `numpy`, `GDAL`) +are installed by this script. This lets the GeoBrix Python wheel be +bumped independently of the GDAL platform layer. +::: @@ -132,12 +184,43 @@ If you see the GeoBrix functions listed, your installation is successful! ### GDAL Library Issues -The init script installs GDAL from the [UbuntuGIS](https://launchpad.net/~ubuntugis/+archive/ubuntu/ubuntugis-unstable) PPA using a pinned package version (`GDAL_PPA_VERSION`) and a fingerprint-verified signing key (`UBUNTUGIS_FPR`). Common failure modes: +The init script installs GDAL from a CI-built bundle staged in your Unity Catalog Volume, verified against a sidecar sha256 file. The CI build itself uses a fingerprint-pinned UbuntuGIS PPA key. Common failure modes: + +1. **Tarball sha256 mismatch** — the script exits with a `sha256sum: WARNING: 1 computed checksum did NOT match` error. The tarball in `VOL_DIR` doesn't match the sidecar staged alongside it: either an upload was truncated, the wrong tarball was uploaded, or the sidecar is stale. Re-download both the tarball and the `.sha256` sidecar from the GeoBrix [release page](https://github.com/databrickslabs/geobrix/releases) and re-upload as a pair. +2. 
**No `*.tar.gz.sha256` sidecar found** — the operator uploaded the tarball but not its sidecar (or vice versa). Both must be present in `VOL_DIR`. If multiple sidecars are present, the script refuses to install — keep only the active version. +3. **Unsupported architecture** — the script exits with `Unsupported architecture: arm64` or similar. The bundle ships `amd64` / `x86_64` `.deb`s only (Intel or AMD CPUs). Choose a non-ARM instance type for the cluster. +4. **DBR base image mismatch** — the script targets the Ubuntu 24.04 (`noble`) base used by DBR 17.3 LTS. +5. **General checks** — confirm the init script ran successfully (driver logs) and that `libgdalalljni.so` is in `/usr/lib` on the driver and executors. The script's tail-of-log section prints the installed `libgdal*` / `libproj*` / `libgeos*` versions — compare against `PACKAGES.txt` inside the tarball to confirm the cluster is on the expected versions. + +#### Reading init logs when the cluster fails to launch + +If the cluster never launches, Databricks's normal `/databricks/init_scripts/...` logs die with the failing node. To recover them, the init script writes a persistent copy of its full stdout/stderr to a per-host subdirectory under your `VOL_DIR`: + +``` +$VOL_DIR/_init_logs/<cluster-id>/<hostname>/init.log +$VOL_DIR/_init_logs/<cluster-id>/<hostname>/_NN_<step>.txt +``` + +Filenames are **stable per host** — within a single cluster launch, each host overwrites its own files in place. On every cluster launch the driver also wipes the entire cluster-level log dir (`$VOL_DIR/_init_logs/<cluster-id>/`) so stale per-host subdirs from earlier launches don't accumulate over time. (Important for autoscaling clusters where worker hostnames change between launches.) Workers themselves only clear their own subdir, so they don't race with peer workers or with the still-running driver during the initial startup window. 
+`init.log` is the full output; `_NN_<step>.txt` are small breadcrumb files marking how far the script got (one per major step: sidecar verify, tarball extract, dpkg, pip, JAR install, etc.) and `_99_trap_*.txt` markers indicating whether the EXIT trap fired and what the cp result was. + +From any working cluster in the same workspace: + +```bash +%sh +# List files for one failing host +find /Volumes/<your-volume-path>/_init_logs/<cluster-id>/ -type f | sort + +# Read the full log +cat /Volumes/<your-volume-path>/_init_logs/<cluster-id>/<hostname>/init.log +``` + +The highest-numbered `_NN_*.txt` breadcrumb tells you the last successful step; `init.log` has the full output up to (and including) the failure. To override the log location (e.g. write to a dedicated logs volume or a Workspace files path), set `WS_LOG_DIR` as a cluster env var or edit the constant near the top of the init script. -1. **Fingerprint mismatch** — the script exits with `ubuntugis key fingerprint mismatch: got='…' expected='…'`. The embedded key block or `UBUNTUGIS_FPR` has been tampered with, or Launchpad rotated its signing key. Re-download the init script from the [release artifacts](https://github.com/databrickslabs/geobrix/releases) and re-verify the fingerprint at [Launchpad's API](https://launchpad.net/api/1.0/~ubuntugis/+archive/ubuntu/ubuntugis-unstable). See the [Security](./security) page for the rationale. -2. **Pinned package version unavailable** — `apt-get install` fails with "Version `…` not found". The PPA has retired that build. Use a newer GeoBrix release whose init script pins a still-available version, rather than locally editing the pin (the JNI binding in the JAR must match). -3. **DBR base image mismatch** — the script targets the Ubuntu 24.04 (`noble`) base used by DBR 17.3 LTS. -4. **General checks** — confirm the init script ran successfully (driver logs) and that `libgdalalljni.so` is in `/usr/lib` on the driver and executors. Adding `/usr/lib` to `LD_LIBRARY_PATH` is rarely needed. 
+:::note Legacy PPA-based init script +If you need the legacy on-cluster PPA install (slow — typically 10–15 minutes of cluster start time, used historically and still useful for bootstrapping a new bundle), see [`scripts/geobrix-gdal-init-ppa.sh`](https://github.com/databrickslabs/geobrix/blob/main/scripts/geobrix-gdal-init-ppa.sh). It is not the recommended path for new clusters. +::: ### Function Registration Issues diff --git a/docs/docs/limitations.mdx b/docs/docs/limitations.mdx index 27c143f..dae245f 100644 --- a/docs/docs/limitations.mdx +++ b/docs/docs/limitations.mdx @@ -52,11 +52,47 @@ Spatial K-Nearest Neighbors is not yet ported: - No KNN operations currently available - H3 support for Geometry-based K-Ring and K-Loop not included +## Coordinate Reference Systems (PROJ version skew) + +GeoBrix's GDAL stack and DBR's built-in spatial functions each carry +their own PROJ runtime — they coexist on the cluster but are not the +same library, and the two PROJ versions can diverge slightly: + +- **GeoBrix's GDAL** links to the PROJ shipped in the cluster init + bundle (currently PROJ 9.4.1, from the UbuntuGIS PPA build of + `libgdal37`). Installed at `/usr/lib/x86_64-linux-gnu/libproj.so.25`. +- **DBR 17.3 LTS's built-in `ST_*` functions** link to PROJ 9.7.1, + bundled with the runtime under `/databricks/native/` (with + `PROJ_DATA=/databricks/native/proj-data`). Not registered with + `ldconfig` and not installed via apt — fully isolated from + `/usr/lib`. + +The two run side-by-side in the same JVM/Python process without +conflict, because they're loaded from distinct paths. The cost is +**CRS-catalog skew at the edges**: any EPSG code added or refined +between PROJ 9.4.1 and 9.7.1 (newly-added projections, updated +grid-shift definitions, transformation pipeline changes) may be +interpreted slightly differently by a GeoBrix function vs. a DBR +built-in operating on the same geometry. 
+ +For the common EPSG codes you're most likely to use day-to-day — +`EPSG:4326`, `EPSG:3857`, `EPSG:27700`, the UTM zones — this is +invisible. For freshly-added projections or very precise datum +transformations, it may surface. + +If you run into a CRS-related discrepancy between a GeoBrix function +and a DBR `ST_*` built-in on the same input, please +[file an issue](https://github.com/databrickslabs/geobrix/issues) with +the EPSG code and a minimal repro — that's the signal we'd use to +prioritize rebuilding the GeoBrix GDAL stack from source against DBR's +PROJ in a future release. + ## Compute Requirements GeoBrix requires Databricks Classic Clusters: - **Not** currently compatible with Serverless compute - Requires GDAL native libraries via init script, which are currently only supported on classic clusters +- **Non-ARM instance types only (Intel or AMD x86_64).** The GDAL bundle ships `amd64` `.deb`s from the UbuntuGIS PPA — `amd64` and `x86_64` are the same architecture, and Intel and AMD CPUs are interchangeable. ARM-based instance types — AWS Graviton, Ampere, Apple Silicon — are not supported. The init script fails fast on `aarch64`. Databricks Runtime: - **Minimum**: DBR 17.1 (recommend DBR 17.3 LTS or later) diff --git a/docs/docs/security.mdx b/docs/docs/security.mdx index 40f9ca4..a80eb68 100644 --- a/docs/docs/security.mdx +++ b/docs/docs/security.mdx @@ -64,23 +64,59 @@ GDAL is the documented exception: its Python wheel must match the GDAL native version installed on the host, so it is installed separately against the detected version. The native side is pinned via the init script (see below). 
-### Pinned GDAL native + fingerprint-pinned PPA key - -The [cluster init script](./installation#3-create-init-script) installs GDAL -from the UbuntuGIS PPA — but instead of trusting whatever signing key -Launchpad happens to serve (the default `add-apt-repository` flow), the script -embeds the expected key inline and **refuses to proceed unless the key's -fingerprint matches `UBUNTUGIS_FPR`**. A tampered cluster image, a swapped key -block in the script, or a Launchpad MITM all fail closed before any GDAL -package is installed. - -On top of the key-fingerprint check, the script also: - -- Pins the GDAL package version (`GDAL_PPA_VERSION`) to the exact `apt` - release that matches the JNI binding shipped in the GeoBrix JAR. -- Installs the Python `GDAL` wheel with `--no-binary :all:` against those - pinned `apt` headers, so the wheel is compiled from source on the cluster - rather than accepting whatever pre-built wheel PyPI happens to serve. +### Pinned GDAL native + multi-layer trust chain + +As of v0.3.0, the GDAL native install path is **pre-built ahead of time and +distributed as a single hash-verified tarball** rather than fetched and compiled +on every cluster start. This both strengthens the supply-chain story (the +PPA round-trip happens once, in a controlled environment) and cuts cluster +start time from ~15 minutes to ~30–90 seconds. + +The trust chain has four layers, each gating the next: + +1. **GPG fingerprint check + reviewed Git-LFS commit (upstream of CI).** + The build script + [`scripts/build-gdal-artifacts.sh`](https://github.com/databrickslabs/geobrix/blob/main/scripts/build-gdal-artifacts.sh) + embeds the UbuntuGIS signing key inline and refuses to proceed unless + its fingerprint matches `UBUNTUGIS_FPR` — the same check the legacy + on-cluster script ran, moved upstream. 
The resolved runtime `.deb` + set (PPA versions of `libgdal`, `libproj`, `libgeos`, `proj-data`, + and other transitive runtime deps) plus the source-compiled GDAL + Python wheel (`--no-binary :all:` against the PPA headers) and the + `libgdalalljni.so` JNI are packaged into a single platform tarball. + + That platform tarball is **committed to the repository under + `resources/static/geobrix-gdal-platform-noble.tar.gz` via Git LFS**. + The PR that adds or updates it is the single human-review checkpoint + for the bytes that will ship to every cluster — reviewers re-run the + build script locally in `ubuntu:24.04` and compare the resulting + sha256 to the committed sidecar before approving. Subsequent + GeoBrix releases reuse that committed platform tarball (the release + workflow grafts the per-release JAR onto a copy of it); no new PPA + round-trip happens unless the platform tarball itself is bumped. + +2. **Per-file `SHA256SUMS` manifest (inside the tarball).** Every `.deb`, + `.whl`, `.so`, and `.jar` in the bundle is hashed at build time. The + cluster verifies the manifest after extraction; any per-file + tampering or transport corruption fails closed before install. + +3. **Outer tarball sha256 sidecar (staged in your UC Volume).** A + `.sha256` sidecar file is published alongside the tarball in + each GeoBrix release. The operator uploads both files to the Volume; + the cluster init script reads the sidecar at runtime and verifies the + tarball against it. A tampered tarball fails closed before extraction. + +4. **Unity Catalog Volume ACL (your environment).** The Volume's write + permission is the boundary that lets the cluster trust the sidecar. + Only the release/operator process should be able to write to the + staging Volume; read access is broader (clusters need it). This is + the only layer of the chain that lives in your workspace — keep + write access tightly scoped. 
+ +The legacy on-cluster install path is still available as +[`scripts/geobrix-gdal-init-ppa.sh`](https://github.com/databrickslabs/geobrix/blob/main/scripts/geobrix-gdal-init-ppa.sh) +(slower, runs the PPA dance every boot) for bootstrapping new bundles or +debugging. ### Hardened, ephemeral CI runners @@ -112,28 +148,39 @@ The upstream controls above protect the artifacts we ship. The controls below are what you can do at install time and at runtime to keep the same guarantees in your environment. -### 1. Use the released init script verbatim - -The PGP fingerprint check, GDAL version pin, and source-only Python install -are load-bearing for the supply-chain story. The only line you should change -in [`scripts/geobrix-gdal-init.sh`](https://github.com/databrickslabs/geobrix/blob/main/scripts/geobrix-gdal-init.sh) -is `VOL_DIR`. Replacing it with a homegrown GDAL installer drops those guarantees -on your cluster. - -### 2. Stage release artifacts in a Volume you control - -You don't have to fetch the JAR, wheel, and `libgdalalljni.so` from the internet on every -cluster start. The recommended flow is: - - - Download the artifacts from the - [GitHub release page](https://github.com/databrickslabs/geobrix/releases). - - Verify the sha256 of each asset against what the release page publishes. - - Upload the verified files to a Unity Catalog Volume your workspace owns. - - Set `VOL_DIR` in the init script to that Volume path. - - Refresh artifacts on a controlled cadence — not automatically. - -This puts the artifacts on storage you control, with the access policy you've -already approved. +### 1. Use the init script from the matching release verbatim + +Each GeoBrix release attaches its own `geobrix-gdal-init.sh` to the +[release page](https://github.com/databrickslabs/geobrix/releases) so the +script and tarball that ship together are unambiguously paired. 
The docs +import the **latest** script from `main` for reference; for a specific +GeoBrix version, use the script attached to that release. + +The architecture check, the sidecar verification, the per-file +`SHA256SUMS` verification, and the offline `pip install` are load-bearing +for the trust chain. The only line you should change in the script is +`VOL_DIR`. Replacing the script with a homegrown GDAL installer drops those +guarantees on your cluster. + +### 2. Stage the tarball + sidecar in a Volume you control + +Both files belong in a Unity Catalog Volume whose **write ACL you've +restricted** to the release/operator process. The cluster only needs read +access. The recommended flow is: + + - Download `geobrix-gdal-artifacts-v<version>-noble.tar.gz` and its + matching `.sha256` sidecar from the + [GitHub release page](https://github.com/databrickslabs/geobrix/releases). + - Verify the tarball locally: `sha256sum -c geobrix-gdal-artifacts-v<version>-noble.tar.gz.sha256`. + - Upload **both files** to the Volume — the cluster init script reads + the sidecar at runtime to know which tarball to expect and what hash + to verify against. + - Refresh on a controlled cadence (when bumping GeoBrix versions or + applying a security patch), not automatically. + +The init script lives separately — typically as a workspace file or in a +separate Volume the cluster reads as its init-script attachment. Pair it +with the bundle by GeoBrix version. ### 3. Pin the GeoBrix version in your cluster libraries diff --git a/python/geobrix/requirements-build.in b/python/geobrix/requirements-build.in new file mode 100644 index 0000000..052dd1f --- /dev/null +++ b/python/geobrix/requirements-build.in @@ -0,0 +1,19 @@ +# Minimal hash-pinned Python deps for the wheel-build path in +# .github/workflows/package-geobrix-artifacts.yml. Trimmed down from +# requirements-ci.in to just the PEP 517 frontend + backend that +# `python -m build` needs — no pytest, coverage, lint tooling, or +# scientific stack. Saves ~30s vs. installing the full CI dep set.
+# +# Edit this file, then regenerate the hash-pinned lock: +# +# cd python/geobrix +# uv pip compile --generate-hashes --python-version 3.12 \ +# --output-file requirements-build.txt requirements-build.in +# +# Versions are deliberately the same as the corresponding lines in +# requirements-ci.in so the two lockfiles can't disagree on what build +# backend version produced any given wheel. If you bump a version here, +# bump it there too (and vice versa) — they're a paired set. +build==1.4.4 +setuptools==80.9.0 # >= 77.0.0 required to parse PEP 639 SPDX license strings (matches requirements-ci.in note) +wheel==0.45.1 diff --git a/resources/static/README.md b/resources/static/README.md new file mode 100644 index 0000000..9381c1a --- /dev/null +++ b/resources/static/README.md @@ -0,0 +1,92 @@ +# `resources/static/` + +Committed binary assets that ship with GeoBrix releases. Reviewed at the commit +that adds or updates them; release workflows treat the bytes here as +authoritative and do no further rebuilding. + +## `geobrix-gdal-platform-noble.tar.gz` (Git LFS) + +The slowly-changing GDAL native install bundle: UbuntuGIS-PPA runtime `.deb`s +(`libgdal37`, `gdal-bin`, `python3-gdal`, `libproj*`, `libgeos*`, `proj-data`, +`libspatialite`, `libnetcdf`, `libhdf5`, plus transitive deps; `libgdal-dev` is +used only inside the build container and is not bundled), the source-compiled GDAL Python wheel, the pip-toolchain +wheels (pip, setuptools, wheel, cython, numpy), and `libgdalalljni.so`. + +The matching `geobrix-gdal-platform-noble.tar.gz.sha256` sidecar is +committed alongside. + +This file is **the trust anchor for the cluster GDAL install** — +[`scripts/geobrix-gdal-init.sh`](../../scripts/geobrix-gdal-init.sh) verifies a +release-time repackage of these bytes (plus the per-release JAR) on every +cluster start. + +### When to rebuild + +- The pinned `GDAL_PPA_VERSION` in + [`scripts/build-gdal-artifacts.sh`](../../scripts/build-gdal-artifacts.sh) + changes. +- A new Ubuntu LTS becomes the DBR base image (e.g. noble → 26.04).
+- A security advisory against one of the bundled libraries. + +Otherwise leave it alone — it's reused across many GeoBrix releases. + +### How to rebuild + +The build is deliberately **local** (not CI) so the reviewer can reproduce +it byte-for-byte against the same Docker base image and confirm the +fingerprint check passed before the bytes are committed. From a host with +Docker: + +```bash +# 1. Build the platform bundle in a fresh noble container. +# --platform-only skips the per-release JAR step so the output is +# reusable across many GeoBrix versions. +# --out points UNDER the mounted /work tree so the tarball survives +# the container exit. +# --platform linux/amd64 is REQUIRED on Apple Silicon (M-series Macs) +# so Docker emulates an x86_64 build host. The script refuses to run +# on aarch64 because the resulting .debs would be ARM binaries that +# can't install on the (amd64) Databricks cluster. Emulation makes +# this build ~2-3x slower than native; budget 15-25 min on M-series +# vs 5-10 min on an Intel/AMD Linux host. On native x86 hosts you +# can drop the --platform flag. +mkdir -p dist +# If you're on the Databricks corp network, pypi.org is blocked — mount +# your host ~/.pip/pip.conf so the venv pip routes through +# pypi-proxy.dev.databricks.com (or whatever mirror your .pip/pip.conf +# resolves to). On an unrestricted network you can drop the -v line. +docker run --rm --platform linux/amd64 \ + -v "$PWD":/work -w /work \ + -v "$HOME/.pip":/root/.pip:ro \ + ubuntu:24.04 bash -c ' + apt-get update && apt-get install -y sudo && + ./scripts/build-gdal-artifacts.sh \ + --jni scripts/gdal311/libgdalalljni.so \ + --out /work/dist/gdal-artifacts \ + --platform-only + ' +# Outputs (on the host, next to dist/gdal-artifacts/): +# dist/geobrix-gdal-platform-noble.tar.gz +# dist/geobrix-gdal-platform-noble.tar.gz.sha256 + +# 2. Move both files into resources/static/. 
+mv dist/geobrix-gdal-platform-noble.tar.gz resources/static/ +mv dist/geobrix-gdal-platform-noble.tar.gz.sha256 resources/static/ + +# 3. Open a PR. The reviewer: +# - Reruns step 1 in their own container. +# - Confirms `sha256sum resources/static/geobrix-gdal-platform-noble.tar.gz` +# matches the rebuild and the committed sidecar. +# - Eyeballs PACKAGES.txt inside the tarball for the libproj/libgeos/etc. +# versions that just shifted. +``` + + + diff --git a/resources/static/geobrix-docs-0.3.0.zip b/resources/static/geobrix-docs-0.3.0.zip deleted file mode 100644 index eae787f..0000000 Binary files a/resources/static/geobrix-docs-0.3.0.zip and /dev/null differ diff --git a/resources/static/geobrix-gdal-platform-noble.tar.gz b/resources/static/geobrix-gdal-platform-noble.tar.gz new file mode 100644 index 0000000..f5933a6 --- /dev/null +++ b/resources/static/geobrix-gdal-platform-noble.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e9057e94f0ba7a8cdc37b2ace5dd12176661ca810d998bd9e415fa1e7fac670 +size 94380383 diff --git a/resources/static/geobrix-gdal-platform-noble.tar.gz.sha256 b/resources/static/geobrix-gdal-platform-noble.tar.gz.sha256 new file mode 100644 index 0000000..e714bc2 --- /dev/null +++ b/resources/static/geobrix-gdal-platform-noble.tar.gz.sha256 @@ -0,0 +1 @@ +0e9057e94f0ba7a8cdc37b2ace5dd12176661ca810d998bd9e415fa1e7fac670 geobrix-gdal-platform-noble.tar.gz diff --git a/scripts/build-gdal-artifacts.sh b/scripts/build-gdal-artifacts.sh new file mode 100755 index 0000000..24ec3d7 --- /dev/null +++ b/scripts/build-gdal-artifacts.sh @@ -0,0 +1,413 @@ +#!/bin/bash +# +# Developer-local tool that produces the slowly-changing GDAL "platform" +# tarball which lives under resources/static/ (tracked by Git LFS) and is +# reused across many GeoBrix releases. 
Rebuild + commit only when: +# - GDAL_PPA_VERSION below changes +# - A new Ubuntu LTS becomes the DBR base image (noble → 26.04) +# - A security advisory against one of the bundled libraries +# +# Run inside a fresh Ubuntu 24.04 container so the resolved .debs and +# source-built wheel match the DBR 17.3 LTS (noble) cluster base image: +# +# mkdir -p dist && docker run --rm -it -v "$PWD":/work -w /work ubuntu:24.04 bash -c ' +# apt-get update && apt-get install -y sudo && +# ./scripts/build-gdal-artifacts.sh \ +# --jni scripts/gdal311/libgdalalljni.so \ +# --out /work/dist/gdal-artifacts \ +# --platform-only +# ' +# (--out must sit under the mounted /work tree, or the outputs vanish with the --rm container.) +# Then commit the two outputs (written to dist/ on the host) into resources/static/: +# geobrix-gdal-platform-noble.tar.gz (Git LFS) +# geobrix-gdal-platform-noble.tar.gz.sha256 +# +# Architecture: x86_64 / amd64 only — these two names refer to the same +# instruction set (Intel and AMD CPUs both qualify; `amd64` is just +# Debian's name for `x86_64`). The PPA ships only amd64 binaries; building +# on ARM / aarch64 (Graviton, Ampere, Apple Silicon) would produce a bundle +# the cluster init script refuses to install. The arch check below fails +# fast on the wrong host. +# +# This script is the upstream end of the trust chain: it verifies the +# UbuntuGIS PPA key fingerprint against UBUNTUGIS_FPR, adds the PPA, +# downloads the resolved .deb set, source-builds the GDAL Python wheel +# against those headers, writes a SHA256SUMS manifest inside the bundle, +# then packages everything into a single .tar.gz with a .sha256 sidecar. +# The PR reviewer reruns this script locally and confirms the sha256 +# matches the committed file — that PR-review step IS the trust anchor for +# everything downstream (release workflow, cluster init script, runtime). +# +# Legacy mode: --jar (without --platform-only) produces a release-shape +# tarball with the JAR baked in.
Kept for one-off debugging; the canonical +# release flow at .github/workflows/package-geobrix-artifacts.yml grafts the +# per-release JAR into the committed platform tarball itself. + +set -euo pipefail +export DEBIAN_FRONTEND=noninteractive + +# ---- arch guard ---------------------------------------------------------- +# Bail before we burn time downloading amd64 .debs onto an ARM build host. +# Intel and AMD CPUs both report `x86_64` here and are equally supported; +# the exclusion is aarch64 (Graviton, Ampere, Apple Silicon). +HOST_ARCH="$(uname -m)" +if [ "$HOST_ARCH" != "x86_64" ]; then + echo "Unsupported build host architecture: $HOST_ARCH" >&2 + echo "This script produces amd64 / x86_64 artifacts only (Intel or AMD CPUs)." >&2 + echo "Run on a non-ARM builder." >&2 + exit 1 +fi + +# ---- args ---------------------------------------------------------------- +# Two modes: +# default : produce a release-shape tarball with the JAR baked in +# (named geobrix-gdal-artifacts-v-noble.tar.gz). +# Used historically when CI built per-release tarballs. +# --platform-only : produce the slowly-changing platform bundle without +# any JAR, named geobrix-gdal-platform-noble.tar.gz. This +# is the file that gets committed under resources/static/ +# and reused across many releases. The per-release JAR is +# grafted in later by package-geobrix-artifacts.yml. 
+JNI_PATH="" +JAR_PATH="" +OUT_DIR="" +PLATFORM_ONLY="false" +while [ $# -gt 0 ]; do + case "$1" in + --jni) JNI_PATH="$2"; shift 2 ;; + --jar) JAR_PATH="$2"; shift 2 ;; + --out) OUT_DIR="$2"; shift 2 ;; + --platform-only) PLATFORM_ONLY="true"; shift ;; + -h|--help) + grep '^# ' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;; + *) echo "Unknown arg: $1" >&2; exit 1 ;; + esac +done + +[ -n "$JNI_PATH" ] && [ -f "$JNI_PATH" ] || { echo "--jni PATH (existing file) required" >&2; exit 1; } +[ -n "$OUT_DIR" ] || { echo "--out DIR required" >&2; exit 1; } +if [ "$PLATFORM_ONLY" = "false" ]; then + [ -n "$JAR_PATH" ] && [ -f "$JAR_PATH" ] || { echo "--jar PATH (existing file) required (or pass --platform-only to skip)" >&2; exit 1; } +fi + +# sudo no-op when running as root (typical inside Docker). +SUDO="" +[ "$EUID" -ne 0 ] && SUDO="sudo" + +# ---- pins (keep in lockstep with the runtime scripts and CI actions) ----- +# Same key fingerprint as geobrix-gdal-init.sh — if the embedded block below +# ever drifts from the runtime script's block, the fingerprint check fails +# closed in both places. +UBUNTUGIS_FPR="2EC86B48E6A9F326623CD22FFF0E7BBEC491C6A1" + +# Same GDAL version as geobrix-gdal-init.sh line 100. Bump in lockstep. +GDAL_PPA_VERSION="3.11.4+dfsg-1~noble0" + +# Pip toolchain pins — match .github/actions/{scala,python}_build/action.yml +# and the original geobrix-gdal-init.sh lines 109–110. +PIP_VERSION="25.0.1" +SETUPTOOLS_VERSION="80.9.0" # >= 77.0.0 required to parse PEP 639 SPDX license strings in GDAL 3.11+ sdist (`license = "MIT"`); 74.0.0 fails with `project.license must be valid exactly by one definition`. Keep in lockstep with python/geobrix/requirements-ci.in. +WHEEL_VERSION="0.45.1" +CYTHON_VERSION="3.0.12" +NUMPY_VERSION="2.1.3" + +# Non-PPA system packages the original init script pulled (line 101). We +# bundle them too so cluster start can dpkg -i without any apt-get update. 
+EXTRA_SYSTEM_PKGS=(unixodbc libcurl3-gnutls libsnappy-dev libopenjp2-7) + +# ---- output dirs --------------------------------------------------------- +OUT_DIR="$(realpath -m -- "$OUT_DIR")" # absolute path even when the parent dir does not exist yet; a single substitution so a failure aborts under set -e instead of silently yielding "/<name>" for the rm -rf below +rm -rf "$OUT_DIR" +mkdir -p "$OUT_DIR/debs" "$OUT_DIR/wheels" + +# ---- step 1: add UbuntuGIS PPA with fingerprint-pinned key --------------- +# This is the runtime trust anchor's *upstream* end: anything that flows into +# the artifact bundle from the PPA is signed by a key whose fingerprint we +# match against UBUNTUGIS_FPR here. If a future Launchpad key rotation or a +# tampered key block changes the fingerprint, this fails BEFORE any .deb gets +# downloaded into the bundle. + +$SUDO apt-get update -y +$SUDO apt-get install -y ca-certificates gpg lsb-release python3-pip python3-venv + +UBUNTUGIS_KEYRING="/etc/apt/keyrings/ubuntugis.gpg" +UBUNTUGIS_LIST="/etc/apt/sources.list.d/ubuntugis-unstable.list" + +UBUNTUGIS_KEY_ASC="$(mktemp)" +trap 'rm -f "$UBUNTUGIS_KEY_ASC"' EXIT +cat > "$UBUNTUGIS_KEY_ASC" <<'UBUNTUGIS_KEY_EOF' +-----BEGIN PGP PUBLIC KEY BLOCK----- +Comment: Hostname: +Version: Hockeypuck 2.2 + +xsFNBGYzcWcBEACZy6Cs/d6xE5dYOX7MY9nMNGALohNGal+lT/gvuU16NYrXV/qs +7NyOLjUmFuEflrbMbOuqW6XaK8FRCkOCMbJAGcxlieLK7e2oV472rw/fMVJYk9du +ebQoYcNfB4Pylb4xpZvG9+zwWWICMZG8JlcV+hLWAC5L9WY/6GycRZMarukPntY5 +f9r6KMohMtcpiqjtpIccTKbxLwB/wPRTri2+clSG1PABhIhLzQqQv2qIlsVGjt0r +eP1DjoNin0yrBrsNZysVSEQW4/3KEW4PN4VqhoGwrNPygN0dwCyQ/yn+ulFhwzgI +KTGlDkEEn+ozONMIccWjGxck3SCjCCH2QO3UwX10AifChgFoms5mKuE0MLYRqgWK +wPGly5n5yBOhz8ctXRQ7L0613hJ6GiBkZMqOTIdXY4NT52e6tsXTaJ/Jx4VwFg64 +j0qJZ5TE1Z//kSTpEmEELsq0rl3Iz9gxeMqalVhoJXBRKb7MMwJn4p0rjbhp9jWj +4tN26LqwLfCNVPrEomUG7ERG6Rs45CfPOh3bLCm9yd3++bcAGN8ne3F1YABY/kyf +bXtjQ/ihhpFMbqUtcUkEIS8xfbnwdORvH+wmaBbSpaMW1JCJNmM3KsdzY16PsckO +Z7YHAqZacirlNN/dZbsFLow958ssjwgGYquVNhiBckE2vIzObdrcHqsx8QARAQAB +zRtMYXVuY2hwYWQgUFBBIGZvciBVYnVudHVHSVPCwY4EEwEKADgWIQQuyGtI5qnz
+JmI80i//Dnu+xJHGoQUCZjNxZwIbAwULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK +CRD/Dnu+xJHGoY8RD/9nviKd8w55J7MxUhI3s6ka15BXqKamZ7zmVn+nYNU9QY3V +HK3gh1Z1SytNcS572AZuym1dTGe779zfIchQ6VN8aFwhLTKMyg4FBGP0opYCPEG1 +y2wwcSTNeOyiwPBECYae0tXi9btYB3GswO30GaQXTpKAy0LDaHSm4zfUkKfnofAQ +lZdznTXgxUJqSn8fzFMIY4bDEImgRp1TS5sIavKQKpFLNJKP1bnCl1/YSTm67SOx +rH1Q0URKJIRsgfj/L4Rt1SW8EZqFb9tDHfcfGSpdvD7LWe7NMVYHBn9CUsSMbfW8 +SwBkUAw/6l0ODeKmUNqSbYTia0GBhX/LwsFrc3cydSlX8NZSKwGztM9F+tOHXaS9 +eVap7Ow6dTuaw/fyJIf57PAVSAkmJ41nSAygr4XaleDTJXHE4T0tHWusb3AXdKUR +4bSthlSQKrFnYnLTBKuN5ijQ5TLzFbMjD22JvFpSQeQeGYkjNfmLOcLU1p4pWCM+ +z5EgjOJcGPbjFqlEkMraUPONJuzFdAnx6d7OdGY9TWserSuI8+392mXhU+9SiS8T +nrbb0Y/WYJmcqkQRmwe6eCs7G+3UJhulUKWEYm37255aNiHKJl+FZEgZ9Zh5tsN/ +RrcIov5r9ncdNv8VP6c6IkOCbH9bOo4jto02TV/WMACEcXCVU7nZCdbCYpHCqA== +=cYNc +-----END PGP PUBLIC KEY BLOCK----- +UBUNTUGIS_KEY_EOF + +actual_fpr=$(gpg --show-keys --with-fingerprint --with-colons "$UBUNTUGIS_KEY_ASC" \ + | awk -F: '/^fpr:/ {print $10; exit}') + +if [ -z "$actual_fpr" ] || [ "$actual_fpr" != "$UBUNTUGIS_FPR" ]; then + echo "ubuntugis key fingerprint mismatch: got='${actual_fpr}' expected='${UBUNTUGIS_FPR}'" >&2 + exit 1 +fi + +$SUDO install -d -m 0755 /etc/apt/keyrings +$SUDO gpg --dearmor --yes -o "$UBUNTUGIS_KEYRING" < "$UBUNTUGIS_KEY_ASC" +$SUDO chmod 0644 "$UBUNTUGIS_KEYRING" + +CODENAME="$(lsb_release -sc)" +if [ "$CODENAME" != "noble" ]; then + echo "WARNING: building on '${CODENAME}', not noble — artifacts may not match DBR 17.3 LTS." >&2 +fi + +echo "deb [signed-by=${UBUNTUGIS_KEYRING}] https://ppa.launchpadcontent.net/ubuntugis/ubuntugis-unstable/ubuntu ${CODENAME} main" \ + | $SUDO tee "$UBUNTUGIS_LIST" >/dev/null + +$SUDO apt-get update -y + +# ---- step 2: download the runtime .deb set into the bundle -------------- +# IMPORTANT: request libgdal37 (runtime), NOT libgdal-dev (headers). The +# transitive closure of libgdal-dev pulls in build-time helpers (automake, +# libtool, autotools-dev, etc.) 
that the cluster's dpkg later fails to +# configure (their post-install scripts depend on packages not in DBR's +# base image). Requesting libgdal37 walks only the runtime side of the +# dep graph — libproj25, libgeos-c1t64, proj-data, libhdf5*, libnetcdf*, +# libspatialite8t64, etc. — which is exactly what the cluster needs. +# +# libgdal-dev IS installed below (step 2b), but only into THIS build +# container so we can source-compile the GDAL Python wheel against its +# headers. Its .debs never enter the bundle. +$SUDO apt-get clean +$SUDO apt-get install -y --reinstall --download-only \ + "libgdal37=${GDAL_PPA_VERSION}" \ + "gdal-bin=${GDAL_PPA_VERSION}" \ + "python3-gdal=${GDAL_PPA_VERSION}" \ + "${EXTRA_SYSTEM_PKGS[@]}" + +# Collect them. /var/cache/apt/archives/partial/ holds in-flight downloads — +# ignore that. Use cp -L just in case any .deb is a symlink (rare but safe). +shopt -s nullglob +debs=(/var/cache/apt/archives/*.deb) +[ "${#debs[@]}" -gt 0 ] || { echo "no .debs downloaded — check apt-get output above" >&2; exit 1; } +$SUDO cp -L "${debs[@]}" "$OUT_DIR/debs/" +$SUDO chown -R "$(id -u):$(id -g)" "$OUT_DIR/debs" + +# Safety-net filter: drop any straggler -dev / autotools / -tools packages +# that may have slipped in. With the step 2 change above (request libgdal37 +# instead of libgdal-dev), apt should walk only the runtime side of the +# transitive graph and these globs should match nothing — but if a future +# DBR runtime lib bump or PPA rebuild surprises us, this keeps the bundle +# clean. 
Each pattern is something we KNOW is build-time-only: +# *-dev_*.deb : headers + pkg-config (libfoo-dev) +# automake / libtool / autotools-dev : configure script helpers +# libpng-tools : has exact-version pin on libpng16-16t64 that's +# out-of-sync with DBR base +( cd "$OUT_DIR/debs" && \ + rm -f -- *-dev_*.deb \ + automake_*.deb libtool_*.deb autotools-dev_*.deb autoconf_*.deb \ + libpng-tools_*.deb ) + +# Emit a human-readable manifest of every .deb in the bundle. The PPA's +# libgdal is built against the PPA's libproj / libgeos / proj-data, so those +# come in via transitive deps and MUST be present here — otherwise the +# cluster's older system libproj wins at link time and runtime CRS / geometry +# operations break in subtle ways. The grep-and-fail checks below make the +# presence of those transitive PPA packages an explicit invariant rather +# than an implicit consequence of apt's resolver. +# dpkg-deb -W only inspects the FIRST file in its arglist even when many +# are passed — so loop one file at a time. ~200 .debs × ~10 ms each is +# ~2 s, negligible against the wheel build below. +( cd "$OUT_DIR/debs" && \ + for deb in *.deb; do + dpkg-deb -W --showformat='${Package} ${Version} ${Architecture}\n' "$deb" + done | LC_ALL=C sort > "$OUT_DIR/PACKAGES.txt" ) + +for required in libgdal libproj libgeos proj-data; do + if ! grep -q "^${required}" "$OUT_DIR/PACKAGES.txt"; then + echo "ERROR: no ${required}* package in bundle — apt resolver did not pull it as a transitive dep of libgdal-dev. Check PPA contents and rerun." >&2 + exit 1 + fi +done + +# Install libgdal-dev + others into THIS build container only — these +# are needed for the GDAL Python wheel source-compile below. By this +# point step 2 has already collected the RUNTIME debs into the bundle, +# so installing libgdal-dev here doesn't add it to the bundle; this is +# strictly a build-time concern. 
+$SUDO apt-get install -y \ + "libgdal-dev=${GDAL_PPA_VERSION}" \ + "gdal-bin=${GDAL_PPA_VERSION}" \ + "python3-gdal=${GDAL_PPA_VERSION}" \ + "${EXTRA_SYSTEM_PKGS[@]}" + +# ---- step 3: build wheels in an isolated venv ---------------------------- +# A venv keeps system Python's site-packages out of the resolution graph so +# `pip wheel` produces deterministic outputs against only the pinned set. + +VENV="$(mktemp -d)/venv" +python3 -m venv "$VENV" +# shellcheck disable=SC1091 +source "$VENV/bin/activate" + +# Upgrade pip itself first so the resolver/build-backend code matches the pin. +pip install --no-cache-dir "pip==${PIP_VERSION}" + +# Pip toolchain + numpy: pre-built wheels are fine (no native compile needed +# against system libs). +pip wheel --wheel-dir="$OUT_DIR/wheels" --no-cache-dir \ + "pip==${PIP_VERSION}" \ + "setuptools==${SETUPTOOLS_VERSION}" \ + "wheel==${WHEEL_VERSION}" \ + "cython==${CYTHON_VERSION}" \ + "numpy==${NUMPY_VERSION}" + +# Install the build toolchain into the venv so the GDAL sdist build below +# can use it via --no-build-isolation. Without that flag, pip's PEP 517 +# isolated build environment re-downloads setuptools/wheel/numpy/cython +# into a temp venv per build — which under amd64 emulation on M-series +# Macs costs 30+ minutes of pip-resolver churn. --no-build-isolation tells +# pip "use what I've already installed in this venv instead." We install +# the same pinned versions the scala_build composite action uses. +pip install --no-cache-dir \ + "setuptools==${SETUPTOOLS_VERSION}" \ + "wheel==${WHEEL_VERSION}" \ + "numpy==${NUMPY_VERSION}" \ + "cython==${CYTHON_VERSION}" + +# GDAL: --no-binary :all: forces sdist compile against the libgdal-dev we +# installed in step 2 (signed by the fingerprint-pinned PPA). This is the +# operation the original init script did on every cluster start — here it +# runs once per artifact build. +# +# --no-build-isolation: see note above. 
Required to keep the M-series +# emulated build under 30 min instead of multi-hour. +# --no-deps: we already pre-installed numpy + cython at the pinned +# versions above. Without this flag, pip's resolver walks numpy sdist +# candidates from PyPI to satisfy the [numpy] extra — and the latest +# numpy sdists use meson-python as their build backend, which is not +# in our venv. The resolver fails with `Cannot import 'mesonpy'` +# before the GDAL compile even starts. +export GDAL_CONFIG=/usr/bin/gdal-config +pip wheel --wheel-dir="$OUT_DIR/wheels" --no-cache-dir --no-build-isolation --no-deps --no-binary :all: \ + "GDAL[numpy]==$(gdal-config --version).*" + +deactivate + +# ---- step 4: native (+ JAR, unless --platform-only) ---------------------- +cp "$JNI_PATH" "$OUT_DIR/libgdalalljni.so" +if [ "$PLATFORM_ONLY" = "false" ]; then + cp "$JAR_PATH" "$OUT_DIR/$(basename "$JAR_PATH")" +fi + +# ---- step 5: SHA256SUMS — the inner trust anchor ------------------------- +# Sort for deterministic output so the same inputs always produce the same +# SHA256SUMS file (useful for audit/diff and for reproducible builds). +# Relative paths so the runtime can `cd && sha256sum -c SHA256SUMS`. +( cd "$OUT_DIR" && find . -type f ! -name SHA256SUMS -print0 \ + | LC_ALL=C sort -z \ + | xargs -0 sha256sum > SHA256SUMS ) + +# ---- step 6: package as a single release tarball + sidecar -------------- +# One tarball at the release-attachment level (GitHub release, internal +# mirror), plus a matching .sha256 sidecar. The cluster init script reads +# the sidecar from the UC Volume at runtime to get the expected hash and +# tarball filename — no hash is hardcoded in the init script, which means +# a security patch or GDAL bump can re-publish (tarball, sidecar) without +# requiring a new init-script release. Trust binds via UC Volume ACLs. + +if [ "$PLATFORM_ONLY" = "true" ]; then + # Platform tarball: name keyed on Ubuntu codename only — version-stable + # across many GeoBrix releases. 
Bumped only when GDAL_PPA_VERSION changes. + TARBALL_NAME="geobrix-gdal-platform-${CODENAME}.tar.gz" + VERSION="" +else + # Release-shape tarball: include GeoBrix version parsed from the JAR name. + VERSION="$(basename "$JAR_PATH" | sed -nE 's/^geobrix-(.+)-jar-with-dependencies\.jar$/\1/p')" + [ -n "$VERSION" ] || { echo "could not parse geobrix version from JAR name: $(basename "$JAR_PATH")" >&2; exit 1; } + TARBALL_NAME="geobrix-gdal-artifacts-v${VERSION}-${CODENAME}.tar.gz" +fi + +PARENT_DIR="$(dirname "$OUT_DIR")" +BUNDLE_NAME="$(basename "$OUT_DIR")" + +# Use deterministic tar flags so the same inputs produce the same bytes: +# --sort=name for stable entry order, --mtime/--owner/--group to strip +# build-time metadata. Makes the outer SHA256 reproducible across rebuilds +# of the same input set, which is what enables the init-script pin model. +( cd "$PARENT_DIR" && tar \ + --sort=name --mtime='UTC 2020-01-01' --owner=0 --group=0 --numeric-owner \ + -czf "$TARBALL_NAME" "$BUNDLE_NAME" ) + +TARBALL_PATH="$PARENT_DIR/$TARBALL_NAME" +TARBALL_SHA256="$(sha256sum "$TARBALL_PATH" | awk '{print $1}')" +echo "${TARBALL_SHA256} ${TARBALL_NAME}" > "$TARBALL_PATH.sha256" + +# ---- summary ------------------------------------------------------------- +echo +echo "==> Artifact bundle ready: $OUT_DIR" +echo " debs: $(ls "$OUT_DIR/debs" | wc -l) packages" +echo " wheels: $(ls "$OUT_DIR/wheels" | wc -l) wheels" +echo " GDAL: $(gdal-config --version)" +echo +echo "==> PPA-sourced transitive deps captured in bundle (these are intentionally" +echo " the UbuntuGIS PPA versions, not the system Ubuntu versions — they pair" +echo " with the PPA libgdal that was source-linked against them):" +grep -E '^(libgdal|libproj|libgeos|proj-data|libspatialite|libnetcdf|libhdf[45])' "$OUT_DIR/PACKAGES.txt" \ + | sed 's/^/ /' +echo +echo "==> Tarball: $TARBALL_PATH" +echo " size: $(du -h "$TARBALL_PATH" | awk '{print $1}')" +echo " sha256: $TARBALL_SHA256" +echo " arch: amd64 / x86_64 (Intel 
or AMD) — ARM / aarch64 not supported" +echo + +if [ "$PLATFORM_ONLY" = "true" ]; then + echo "==> --platform-only build. Commit these two files to resources/static/" + echo " (the .tar.gz is tracked by Git LFS — see .gitattributes):" + echo " $TARBALL_NAME" + echo " $TARBALL_NAME.sha256" + echo + echo " The per-release JAR is grafted in at release time by" + echo " .github/workflows/package-geobrix-artifacts.yml." +else # release-shape build: print the attach list plus operator steps 1-4 (step 4 belongs here, not after fi, so --platform-only never prints a dangling "4.") + echo "==> Release-shape build. Artifacts to attach to the GeoBrix v${VERSION} release:" + echo " 1. $TARBALL_NAME (the bundle)" + echo " 2. $TARBALL_NAME.sha256 (sidecar — trust anchor)" + echo " 3. scripts/geobrix-gdal-init.sh (init script — versioned with this release)" + echo + echo "==> Operator (per cluster):" + echo " 1. Download all three from the release page." + echo " 2. Verify the tarball: sha256sum -c $TARBALL_NAME.sha256" + echo " 3. Upload tarball + sidecar to UC Volume (NOT the init script):" + echo " databricks fs cp $TARBALL_PATH \\" + echo " dbfs:/Volumes/geospatial_docs/gdal_artifacts/noble/geobrix/ --overwrite" + echo " databricks fs cp $TARBALL_PATH.sha256 \\" + echo " dbfs:/Volumes/geospatial_docs/gdal_artifacts/noble/geobrix/ --overwrite" + echo " 4. Point the cluster's init-script setting at the downloaded" + echo " geobrix-gdal-init.sh (in workspace files or its own volume path)." +fi diff --git a/scripts/geobrix-gdal-init-ppa.sh b/scripts/geobrix-gdal-init-ppa.sh new file mode 100644 index 0000000..7099f7f --- /dev/null +++ b/scripts/geobrix-gdal-init-ppa.sh @@ -0,0 +1,136 @@ +#!/bin/bash +# +# ============================================================================ +# LEGACY PATH — slow cluster start (~15 minutes) +# ============================================================================ +# This script adds the UbuntuGIS PPA, downloads + installs GDAL .debs on every +# cluster boot, and SOURCE-COMPILES the GDAL Python bindings against the +# freshly-installed libgdal-dev.
Total cold-start cost on a Databricks cluster +# is typically 10–15 minutes (PPA fetch + apt install + ~5–8 min source build +# of GDAL[numpy] under pip --no-binary :all:). +# +# Prefer scripts/geobrix-gdal-init.sh for new clusters: it installs the same +# fingerprint-verified set of artifacts in 30–90 seconds by pre-staging the +# CI-built bundle in a Unity Catalog Volume. Keep this script around only for: +# - bootstrapping the very first artifact bundle (CI uses this dance to +# produce what gets staged for the tarball script), or +# - troubleshooting a cluster that can't read from the staging volume. +# +# Databricks cluster init script. This file is uploaded to a Workspace +# volume and run by the cluster on boot — the ubuntugis PPA signing key is embedded +# inline below. Keep this file self-contained. + +set -euo pipefail + +sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-backports main universe multiverse restricted" +sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-updates main universe multiverse restricted" +sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-security main multiverse restricted universe" +sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main multiverse restricted universe" + +# - add ubuntugis PPA with fingerprint-pinned GPG key. +# We do NOT call `add-apt-repository ppa:ubuntugis/ubuntugis-unstable`: +# that helper auto-installs whatever key Launchpad serves (TOFU). Instead +# the signing key is embedded below and rejected unless its fingerprint +# matches UBUNTUGIS_FPR — so a tampered cluster image, a swapped key +# block in this script, or a Launchpad MITM all fail closed before any +# GDAL package gets pulled through the PPA's signing chain. 
+# +# Expected fingerprint sourced from Launchpad's signing_key_fingerprint API: +# curl https://launchpad.net/api/1.0/~ubuntugis/+archive/ubuntu/ubuntugis-unstable \ +# | jq -r .signing_key_fingerprint +# Re-verify on key bump and update the embedded block below in lockstep. +UBUNTUGIS_FPR="2EC86B48E6A9F326623CD22FFF0E7BBEC491C6A1" +UBUNTUGIS_KEYRING="/etc/apt/keyrings/ubuntugis.gpg" +UBUNTUGIS_LIST="/etc/apt/sources.list.d/ubuntugis-unstable.list" + +sudo apt-get install -y software-properties-common gpg + +UBUNTUGIS_KEY_ASC="$(mktemp)" +trap 'rm -f "$UBUNTUGIS_KEY_ASC"' EXIT +cat > "$UBUNTUGIS_KEY_ASC" <<'UBUNTUGIS_KEY_EOF' +-----BEGIN PGP PUBLIC KEY BLOCK----- +Comment: Hostname: +Version: Hockeypuck 2.2 + +xsFNBGYzcWcBEACZy6Cs/d6xE5dYOX7MY9nMNGALohNGal+lT/gvuU16NYrXV/qs +7NyOLjUmFuEflrbMbOuqW6XaK8FRCkOCMbJAGcxlieLK7e2oV472rw/fMVJYk9du +ebQoYcNfB4Pylb4xpZvG9+zwWWICMZG8JlcV+hLWAC5L9WY/6GycRZMarukPntY5 +f9r6KMohMtcpiqjtpIccTKbxLwB/wPRTri2+clSG1PABhIhLzQqQv2qIlsVGjt0r +eP1DjoNin0yrBrsNZysVSEQW4/3KEW4PN4VqhoGwrNPygN0dwCyQ/yn+ulFhwzgI +KTGlDkEEn+ozONMIccWjGxck3SCjCCH2QO3UwX10AifChgFoms5mKuE0MLYRqgWK +wPGly5n5yBOhz8ctXRQ7L0613hJ6GiBkZMqOTIdXY4NT52e6tsXTaJ/Jx4VwFg64 +j0qJZ5TE1Z//kSTpEmEELsq0rl3Iz9gxeMqalVhoJXBRKb7MMwJn4p0rjbhp9jWj +4tN26LqwLfCNVPrEomUG7ERG6Rs45CfPOh3bLCm9yd3++bcAGN8ne3F1YABY/kyf +bXtjQ/ihhpFMbqUtcUkEIS8xfbnwdORvH+wmaBbSpaMW1JCJNmM3KsdzY16PsckO +Z7YHAqZacirlNN/dZbsFLow958ssjwgGYquVNhiBckE2vIzObdrcHqsx8QARAQAB +zRtMYXVuY2hwYWQgUFBBIGZvciBVYnVudHVHSVPCwY4EEwEKADgWIQQuyGtI5qnz +JmI80i//Dnu+xJHGoQUCZjNxZwIbAwULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK +CRD/Dnu+xJHGoY8RD/9nviKd8w55J7MxUhI3s6ka15BXqKamZ7zmVn+nYNU9QY3V +HK3gh1Z1SytNcS572AZuym1dTGe779zfIchQ6VN8aFwhLTKMyg4FBGP0opYCPEG1 +y2wwcSTNeOyiwPBECYae0tXi9btYB3GswO30GaQXTpKAy0LDaHSm4zfUkKfnofAQ +lZdznTXgxUJqSn8fzFMIY4bDEImgRp1TS5sIavKQKpFLNJKP1bnCl1/YSTm67SOx +rH1Q0URKJIRsgfj/L4Rt1SW8EZqFb9tDHfcfGSpdvD7LWe7NMVYHBn9CUsSMbfW8 +SwBkUAw/6l0ODeKmUNqSbYTia0GBhX/LwsFrc3cydSlX8NZSKwGztM9F+tOHXaS9 
+eVap7Ow6dTuaw/fyJIf57PAVSAkmJ41nSAygr4XaleDTJXHE4T0tHWusb3AXdKUR +4bSthlSQKrFnYnLTBKuN5ijQ5TLzFbMjD22JvFpSQeQeGYkjNfmLOcLU1p4pWCM+ +z5EgjOJcGPbjFqlEkMraUPONJuzFdAnx6d7OdGY9TWserSuI8+392mXhU+9SiS8T +nrbb0Y/WYJmcqkQRmwe6eCs7G+3UJhulUKWEYm37255aNiHKJl+FZEgZ9Zh5tsN/ +RrcIov5r9ncdNv8VP6c6IkOCbH9bOo4jto02TV/WMACEcXCVU7nZCdbCYpHCqA== +=cYNc +-----END PGP PUBLIC KEY BLOCK----- +UBUNTUGIS_KEY_EOF + +actual_fpr=$(gpg --show-keys --with-fingerprint --with-colons "$UBUNTUGIS_KEY_ASC" \ + | awk -F: '/^fpr:/ {print $10; exit}') + +if [ -z "$actual_fpr" ] || [ "$actual_fpr" != "$UBUNTUGIS_FPR" ]; then + echo "ubuntugis key fingerprint mismatch: got='${actual_fpr}' expected='${UBUNTUGIS_FPR}'" >&2 + exit 1 +fi + +sudo install -d -m 0755 /etc/apt/keyrings +sudo gpg --dearmor --yes -o "$UBUNTUGIS_KEYRING" < "$UBUNTUGIS_KEY_ASC" +sudo chmod 0644 "$UBUNTUGIS_KEYRING" + +CODENAME="$(lsb_release -sc)" +{ + echo "deb [signed-by=${UBUNTUGIS_KEYRING}] https://ppa.launchpadcontent.net/ubuntugis/ubuntugis-unstable/ubuntu ${CODENAME} main" + echo "deb-src [signed-by=${UBUNTUGIS_KEYRING}] https://ppa.launchpadcontent.net/ubuntugis/ubuntugis-unstable/ubuntu ${CODENAME} main" +} | sudo tee "$UBUNTUGIS_LIST" >/dev/null + +sudo apt-get update -y + +# Update VOL_DIR to point at the Unity Catalog volume where you've staged +# libgdalalljni.so + geobrix-*-jar-with-dependencies.jar before deploying +# this script to a cluster. +VOL_DIR="/Volumes/geospatial_docs/gdal_artifacts/noble/geobrix" +if [ ! -d "$VOL_DIR" ]; then + echo "VOL_DIR not found: $VOL_DIR" >&2 + echo "Edit this script and set VOL_DIR to the volume containing the GeoBrix native + JAR artifacts before re-running." >&2 + exit 1 +fi + +# install natives — keep GDAL_PPA_VERSION in sync with CI (.github/actions/*/action.yml). 
+# https://gdal.org/en/stable/api/python/python_bindings.html +# https://medium.com/@felipempfreelancer/install-gdal-for-python-on-ubuntu-24-04-9ed65dd39cac +GDAL_PPA_VERSION="3.11.4+dfsg-1~noble0" +sudo apt-get -o DPkg::Lock::Timeout=-1 install -y unixodbc libcurl3-gnutls libsnappy-dev libopenjp2-7 +sudo apt-get -o DPkg::Lock::Timeout=-1 install -y \ + "libgdal-dev=${GDAL_PPA_VERSION}" \ + "gdal-bin=${GDAL_PPA_VERSION}" \ + "python3-gdal=${GDAL_PPA_VERSION}" + +# pip install GDAL (match deps to DBR 17.3 LTS — see release notes for the runtime). +# Bootstrap pins must match .github/actions/{scala,python}_build/action.yml — keep these in sync. +pip install --upgrade pip==25.0.1 setuptools==80.9.0 wheel==0.45.1 cython==3.0.12 # setuptools >= 77.0.0 required for GDAL 3.11+ sdist's PEP 639 SPDX license string +pip install numpy==2.1.3 +export GDAL_CONFIG=/usr/bin/gdal-config +# --no-binary :all: forces sdist compile against the apt-installed libgdal +# headers above (signed by the fingerprint-pinned ubuntugis key), rather +# than accepting whatever pre-built wheel PyPI happens to serve. +pip install --no-cache-dir --no-binary :all: --force-reinstall GDAL[numpy]=="$(gdal-config --version).*" + +# copy JNI and JAR. Quote VOL_DIR so paths with spaces don't break under +# `set -u`; the glob expands after substitution. +cp "$VOL_DIR/libgdalalljni.so" /usr/lib/libgdalalljni.so +cp "$VOL_DIR"/geobrix-*-jar-with-dependencies.jar /databricks/jars diff --git a/scripts/geobrix-gdal-init.sh b/scripts/geobrix-gdal-init.sh old mode 100644 new mode 100755 index 94a7ecc..eeacce9 --- a/scripts/geobrix-gdal-init.sh +++ b/scripts/geobrix-gdal-init.sh @@ -1,120 +1,447 @@ #!/bin/bash # -# Databricks cluster init script. This file is uploaded to a Workspace -# volume and run by the cluster on boot — the ubuntugis PPA signing key is embedded -# inline below. Keep this file self-contained. +# Databricks cluster init script — volume-staged GDAL install for GeoBrix. 
+# Same security spirit as geobrix-gdal-init-ppa.sh (fingerprint-pinned PPA + +# source-built GDAL bindings), but the slow build runs once in CI and ships +# its outputs as a single tarball attached to each GeoBrix release. Cluster +# start downloads zero bytes from the internet, verifies the bundle against +# its release-published sidecar, extracts to local disk, and installs. +# +# Trust anchors (defense in depth): +# 1. UC Volume ACL — only the release/CI process has write access to +# VOL_DIR. This is the boundary that lets us trust the sidecar found +# there. (Read access to the volume is broader; that's fine.) +# 2. .sha256 sidecar (staged in VOL_DIR) — pins the byte hash of +# the tarball. The init script reads it at runtime and refuses to +# proceed on mismatch, so a tampered tarball fails closed before +# extraction. +# 3. SHA256SUMS inside the tarball — per-file manifest verified +# post-extract. Catches transport corruption + gives an auditable +# per-file pin for forensics. +# 4. UBUNTUGIS_FPR in scripts/build-gdal-artifacts.sh — gates what enters +# the tarball at CI build time. The GPG fingerprint check is upstream +# of every hash above. +# +# Why sidecar (not hardcoded hash in this script): +# The init script is itself a release artifact (attached to each GeoBrix +# release alongside the tarball), so the script+tarball pairing is already +# visible at the release level. The sidecar lets the operator hot-swap a +# re-built bundle (security patch, GDAL bump) without re-cutting an init- +# script release — they just stage the new tarball + new .sha256 in +# VOL_DIR. Trust still binds to the UC Volume ACL. +# +# Architecture: x86_64 / amd64 only — Intel and AMD CPUs are interchangeable +# (`amd64` is just Debian's name for `x86_64`). The exclusion is ARM / +# aarch64 — AWS Graviton, Ampere, Apple Silicon, etc. — because the PPA +# ships only amd64 .debs. Pick a non-ARM instance type for this cluster. 
+# +# Distribution flow: +# CI: scripts/build-gdal-artifacts.sh → tarball + tarball.sha256 +# Release: both attached to the GeoBrix GitHub release, alongside the +# matching version of this script +# Operator: download both files, upload to VOL_DIR +# Cluster: this script discovers them, verifies, extracts, installs +# +# TROUBLESHOOTING +# +# 1) This script FAILS and the cluster never launches. +# By default, this script writes its full stdout+stderr to a local +# /tmp file and copies that file to VOL_DIR via an EXIT trap on +# script exit. The persistent copy survives the failing cluster's +# teardown. Path layout (per host, fixed filenames): +# $VOL_DIR/_init_logs/$DB_CLUSTER_ID/$(hostname)/init.log +# $VOL_DIR/_init_logs/$DB_CLUSTER_ID/$(hostname)/_NN_*.txt +# On every cluster launch the driver wipes the entire cluster-level +# log dir ($VOL_DIR/_init_logs/$DB_CLUSTER_ID/) — clean slate, no +# stale per-host subdirs from prior launches accumulating. Workers +# leave peer subdirs alone and only clear their own. Per-host +# subdirectories isolate driver vs worker logs (and multiple workers +# from each other) — no risk of two nodes racing on the same path. +# To read everything from any working cluster: +# %sh +# find /Volumes/<vol-path>/_init_logs/ -type f +# cat /Volumes/<vol-path>/_init_logs/<cluster-id>/<hostname>/init.log +# No env-var setup needed — reuses the VOL_DIR already configured +# for the platform tarball. +# +# Why the /tmp → cp dance? UC Volumes are S3-backed, and S3 objects +# don't support append. Incremental writes from a long-running `tee` +# process buffer at the FUSE layer and never flush if the host +# terminates before close(). A single bulk cp from /tmp is one +# open/write-all/close cycle that FUSE flushes as a single S3 PUT. +# Caveat: if Databricks SIGKILLs this script (init-script TIMEOUT, +# distinct from the script's own non-zero exit), the trap doesn't +# run and the cp doesn't happen — the local /tmp file dies with the +# node.
dpkg failures, sha mismatches, and set-e aborts ALL run the +# trap normally and produce a persistent log. +# +# Override the log location by setting WS_LOG_DIR (env var or hardcoded +# near the top of this script) if you want logs in a Workspace files +# path or a dedicated logs volume instead. +# +# Backstops if VOL_DIR write is denied (the script will log a WARNING +# line via the cluster Event Log indicating this): +# - Databricks UI: cluster page → "Event log" tab → click the +# "Init script failure" event for the last lines of stderr. +# - From a surviving cluster in the same workspace, before the +# failing driver's local files are gone: +# sudo ls /databricks/init_scripts/ +# sudo cat /databricks/init_scripts/*_geobrix-gdal-init.sh.stderr.log +# sudo cat /databricks/init_scripts/*_geobrix-gdal-init.sh.stdout.log +# Common causes and what to do: +# - "no *.tar.gz.sha256 sidecar found in $VOL_DIR" → operator +# forgot to upload the .sha256 sidecar alongside the tarball. +# - sha256sum -c reports "WARNING: 1 computed checksum did NOT match" +# → the tarball +# was truncated or replaced; re-download from the release page, +# verify locally with `sha256sum -c <tarball>.sha256`, re-upload. +# - "dpkg: error processing ..." → the bundle +# is missing a runtime dep, or a bundled runtime lib conflicts +# with what DBR's base image pre-installs. Rebuild the platform +# tarball locally via scripts/build-gdal-artifacts.sh against a +# fresh ubuntu:24.04 container, smoke-test, and ship a replacement +# release. (Build-time-only packages — `*-dev`, autotools, etc. — +# are filtered out of the bundle by the build script; if dpkg +# complains about one of those, that filter has regressed.) +# +# 2) This script SUCCEEDS but GDAL functions fail at runtime.
+# Run in a %sh notebook cell on the launched cluster to find the gap: +# dpkg -l 2>/dev/null | grep -E '^ii\s+(libgdal|libproj|libgeos|gdal-bin|gdal-data|python3-gdal)' +# python -c "from osgeo import gdal; print(gdal.__version__)" +# ldd /usr/lib/x86_64-linux-gnu/libgdal.so.37 | grep 'not found' +# env | grep -i proj +# Any "not found" line in ldd → a runtime shared lib the bundle didn't +# ship. Compare the installed set against PACKAGES.txt inside the +# tarball (extract locally, grep). Fix by adding the missing package +# to scripts/build-gdal-artifacts.sh's EXTRA_SYSTEM_PKGS, rebuilding +# the platform tarball, and recommitting. set -euo pipefail +export DEBIAN_FRONTEND=noninteractive -sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-backports main universe multiverse restricted" -sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-updates main universe multiverse restricted" -sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-security main multiverse restricted universe" -sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main multiverse restricted universe" - -# - add ubuntugis PPA with fingerprint-pinned GPG key. -# We do NOT call `add-apt-repository ppa:ubuntugis/ubuntugis-unstable`: -# that helper auto-installs whatever key Launchpad serves (TOFU). Instead -# the signing key is embedded below and rejected unless its fingerprint -# matches UBUNTUGIS_FPR — so a tampered cluster image, a swapped key -# block in this script, or a Launchpad MITM all fail closed before any -# GDAL package gets pulled through the PPA's signing chain. -# -# Expected fingerprint sourced from Launchpad's signing_key_fingerprint API: -# curl https://launchpad.net/api/1.0/~ubuntugis/+archive/ubuntu/ubuntugis-unstable \ -# | jq -r .signing_key_fingerprint -# Re-verify on key bump and update the embedded block below in lockstep. 
-UBUNTUGIS_FPR="2EC86B48E6A9F326623CD22FFF0E7BBEC491C6A1" -UBUNTUGIS_KEYRING="/etc/apt/keyrings/ubuntugis.gpg" -UBUNTUGIS_LIST="/etc/apt/sources.list.d/ubuntugis-unstable.list" - -sudo apt-get install -y software-properties-common gpg - -UBUNTUGIS_KEY_ASC="$(mktemp)" -trap 'rm -f "$UBUNTUGIS_KEY_ASC"' EXIT -cat > "$UBUNTUGIS_KEY_ASC" <<'UBUNTUGIS_KEY_EOF' ------BEGIN PGP PUBLIC KEY BLOCK----- -Comment: Hostname: -Version: Hockeypuck 2.2 - -xsFNBGYzcWcBEACZy6Cs/d6xE5dYOX7MY9nMNGALohNGal+lT/gvuU16NYrXV/qs -7NyOLjUmFuEflrbMbOuqW6XaK8FRCkOCMbJAGcxlieLK7e2oV472rw/fMVJYk9du -ebQoYcNfB4Pylb4xpZvG9+zwWWICMZG8JlcV+hLWAC5L9WY/6GycRZMarukPntY5 -f9r6KMohMtcpiqjtpIccTKbxLwB/wPRTri2+clSG1PABhIhLzQqQv2qIlsVGjt0r -eP1DjoNin0yrBrsNZysVSEQW4/3KEW4PN4VqhoGwrNPygN0dwCyQ/yn+ulFhwzgI -KTGlDkEEn+ozONMIccWjGxck3SCjCCH2QO3UwX10AifChgFoms5mKuE0MLYRqgWK -wPGly5n5yBOhz8ctXRQ7L0613hJ6GiBkZMqOTIdXY4NT52e6tsXTaJ/Jx4VwFg64 -j0qJZ5TE1Z//kSTpEmEELsq0rl3Iz9gxeMqalVhoJXBRKb7MMwJn4p0rjbhp9jWj -4tN26LqwLfCNVPrEomUG7ERG6Rs45CfPOh3bLCm9yd3++bcAGN8ne3F1YABY/kyf -bXtjQ/ihhpFMbqUtcUkEIS8xfbnwdORvH+wmaBbSpaMW1JCJNmM3KsdzY16PsckO -Z7YHAqZacirlNN/dZbsFLow958ssjwgGYquVNhiBckE2vIzObdrcHqsx8QARAQAB -zRtMYXVuY2hwYWQgUFBBIGZvciBVYnVudHVHSVPCwY4EEwEKADgWIQQuyGtI5qnz -JmI80i//Dnu+xJHGoQUCZjNxZwIbAwULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK -CRD/Dnu+xJHGoY8RD/9nviKd8w55J7MxUhI3s6ka15BXqKamZ7zmVn+nYNU9QY3V -HK3gh1Z1SytNcS572AZuym1dTGe779zfIchQ6VN8aFwhLTKMyg4FBGP0opYCPEG1 -y2wwcSTNeOyiwPBECYae0tXi9btYB3GswO30GaQXTpKAy0LDaHSm4zfUkKfnofAQ -lZdznTXgxUJqSn8fzFMIY4bDEImgRp1TS5sIavKQKpFLNJKP1bnCl1/YSTm67SOx -rH1Q0URKJIRsgfj/L4Rt1SW8EZqFb9tDHfcfGSpdvD7LWe7NMVYHBn9CUsSMbfW8 -SwBkUAw/6l0ODeKmUNqSbYTia0GBhX/LwsFrc3cydSlX8NZSKwGztM9F+tOHXaS9 -eVap7Ow6dTuaw/fyJIf57PAVSAkmJ41nSAygr4XaleDTJXHE4T0tHWusb3AXdKUR -4bSthlSQKrFnYnLTBKuN5ijQ5TLzFbMjD22JvFpSQeQeGYkjNfmLOcLU1p4pWCM+ -z5EgjOJcGPbjFqlEkMraUPONJuzFdAnx6d7OdGY9TWserSuI8+392mXhU+9SiS8T -nrbb0Y/WYJmcqkQRmwe6eCs7G+3UJhulUKWEYm37255aNiHKJl+FZEgZ9Zh5tsN/ 
-RrcIov5r9ncdNv8VP6c6IkOCbH9bOo4jto02TV/WMACEcXCVU7nZCdbCYpHCqA== -=cYNc ------END PGP PUBLIC KEY BLOCK----- -UBUNTUGIS_KEY_EOF - -actual_fpr=$(gpg --show-keys --with-fingerprint --with-colons "$UBUNTUGIS_KEY_ASC" \ - | awk -F: '/^fpr:/ {print $10; exit}') - -if [ -z "$actual_fpr" ] || [ "$actual_fpr" != "$UBUNTUGIS_FPR" ]; then - echo "ubuntugis key fingerprint mismatch: got='${actual_fpr}' expected='${UBUNTUGIS_FPR}'" >&2 - exit 1 +# Unity Catalog volume where the operator stages the release tarball + sidecar. +VOL_DIR="/Volumes/geospatial_docs/gdal_artifacts/noble/geobrix" + +# Persistent logging — survives the failing cluster's teardown. +# +# Defaults to a sibling subdirectory of VOL_DIR (cluster already has +# read access there; write usually works under the same grant). Override +# by setting WS_LOG_DIR as a cluster env var or hardcoding here: +# WS_LOG_DIR=/Workspace/Users/you@example.com/logging/geobrix +# +# Why local /tmp + cp on exit (not tee directly to UC Volume): UC Volume +# is S3-backed. S3 objects don't support append — incremental tee writes +# buffer at the FUSE layer and never flush if the host dies first +# (observed: destination file gets touched but stays empty). A single +# bulk cp on exit is one open/write-all/close cycle that FUSE flushes +# as a single S3 PUT, which works reliably. +WS_LOG_DIR="${WS_LOG_DIR:-$VOL_DIR/_init_logs}" +CLUSTER_ID="${DB_CLUSTER_ID:-no-cluster-id}" +HOSTNAME_LBL="$(hostname)" +# Per-host paths everywhere — both local (avoids any /tmp collision in +# edge cases like sequential init runs on the same node) and persistent +# (driver + workers each write to a clearly-distinct path; no chance of +# two nodes racing on the same S3 object). Path layout: +# $WS_LOG_DIR///init.log (full log) +# $WS_LOG_DIR///_NN_*.txt (step breadcrumbs) +# Filenames are STABLE per host (no timestamp) so each cluster launch +# overwrites the previous launch's logs in place — the directory never +# accumulates a growing set of files. 
+LOCAL_LOG="/tmp/geobrix-init-${HOSTNAME_LBL}.log" +FINAL_LOG_DIR="$WS_LOG_DIR/$CLUSTER_ID/$HOSTNAME_LBL" +FINAL_LOG="$FINAL_LOG_DIR/init.log" + +# Cluster launch == clean slate. +# +# On the driver only, wipe the entire cluster-level log dir before this +# run starts. That removes stale per-host subdirs left over from prior +# launches of the same cluster — important for autoscaling clusters +# where workers come and go and hostnames vary between launches; without +# the driver-side wipe, those orphaned subdirs would accumulate forever. +# +# Workers do NOT wipe the cluster dir (would race with peer workers and +# with the still-running driver during the initial startup window). On a +# worker, the host-level clear below catches any residue tied to that +# specific hostname. +# +# Detection: DB_IS_DRIVER is set to "TRUE" on the driver, "FALSE" or +# unset on workers — set by Databricks on every cluster-scoped init run. +if [ "${DB_IS_DRIVER:-FALSE}" = "TRUE" ]; then + rm -rf "$WS_LOG_DIR/$CLUSTER_ID" 2>/dev/null || true fi -sudo install -d -m 0755 /etc/apt/keyrings -sudo gpg --dearmor --yes -o "$UBUNTUGIS_KEYRING" < "$UBUNTUGIS_KEY_ASC" -sudo chmod 0644 "$UBUNTUGIS_KEYRING" +mkdir -p "$FINAL_LOG_DIR" 2>/dev/null || true +# Clear this host's prior logs (covers worker autoscale-in scenarios +# where the cluster-level wipe above didn't run on this node). +( cd "$FINAL_LOG_DIR" 2>/dev/null && rm -f -- *.txt *.log ) 2>/dev/null || true +# Also clear any local /tmp leftover so tee starts fresh (without -a). 
+: > "$LOCAL_LOG" 2>/dev/null || true

-CODENAME="$(lsb_release -sc)"
-{
-  echo "deb [signed-by=${UBUNTUGIS_KEYRING}] https://ppa.launchpadcontent.net/ubuntugis/ubuntugis-unstable/ubuntu ${CODENAME} main"
-  echo "deb-src [signed-by=${UBUNTUGIS_KEYRING}] https://ppa.launchpadcontent.net/ubuntugis/ubuntugis-unstable/ubuntu ${CODENAME} main"
-} | sudo tee "$UBUNTUGIS_LIST" >/dev/null
+echo "started at $(date -Iseconds) host=$HOSTNAME_LBL cluster=$CLUSTER_ID pid=$$" \
+    > "$FINAL_LOG_DIR/_01_started.txt" 2>/dev/null || true

-sudo apt-get update -y
+# Step breadcrumbs — small per-step files written inline so we can see
+# how far the script got even if the cluster terminates abruptly and
+# the EXIT trap is skipped (SIGKILL during cluster teardown, FUSE
+# unmount, etc.). Each call writes < 200 B + an explicit `sync` so
+# FUSE flushes the PUT to S3 before the next step runs. Without sync,
+# steps near the end of the script (written in quick succession right
+# before script exit) may sit in the FUSE buffer and never flush
+# before the host terminates.
+#
+# Usage: step <NN_name> [detail]
+#   $1  breadcrumb id; becomes the file name "_$1.txt"
+#   $2  optional free-text detail appended to the breadcrumb line
+# Best-effort: every write is guarded with `|| true`, so a read-only or
+# missing log dir never aborts the install.
+step() {
+    echo "$(date -Iseconds) step=$1 ${2:-}" \
+        > "$FINAL_LOG_DIR/_$1.txt" 2>/dev/null || true
+    sync 2>/dev/null || true
+}
+
+# EXIT trap copies local /tmp log to VOL_DIR on script exit. Runs on
+# normal exit, on `set -e` failures, and on most signals — but NOT on
+# SIGKILL. Databricks SIGKILLs init scripts only on init-script TIMEOUT,
+# not on script-level non-zero exits, so dpkg/sha failures et al. land
+# in the persistent log normally.
+#
+# The handler finishes by re-exiting with the script's ORIGINAL status,
+# so a failed verification or install still fails the init script.
+cleanup_log() {
+    # MUST be the first statement. $? holds the script's exit status only
+    # until the next command runs; reading it after the breadcrumb below
+    # (`: > file || true` always yields 0) made EXIT_CODE permanently 0,
+    # and the final `exit` then reported every failed run as a success.
+    EXIT_CODE=$?
+
+    # Earliest breadcrumb — confirms the trap actually fired, no
+    # matter what happens after.
+    : > "$FINAL_LOG_DIR/_99_trap_pinged.txt" 2>/dev/null || true
+
+    echo "trap entered exit=$EXIT_CODE at $(date -Iseconds)" \
+        > "$FINAL_LOG_DIR/_99_trap_entered.txt" 2>/dev/null || true
+
+    echo "--- Init script finished at $(date -Iseconds) (exit $EXIT_CODE) ---"
+    # Let tee flush its last bytes to /tmp before the bulk cp.
+    sleep 0.5
+
+    CP_STATUS="cp not attempted"
+    if cp "$LOCAL_LOG" "$FINAL_LOG" 2>&1; then
+        CP_STATUS="cp succeeded; bytes=$(stat -c%s "$FINAL_LOG" 2>/dev/null || echo unknown)"
+        echo "--- Log copied to: $FINAL_LOG ---"
+    else
+        # In the else branch $? is still cp's failing status.
+        CP_STATUS="cp FAILED (exit $?)"
+        echo "--- WARNING: cp '$LOCAL_LOG' '$FINAL_LOG' failed ---" >&2
+    fi
+
+    echo "$CP_STATUS at $(date -Iseconds)" \
+        > "$FINAL_LOG_DIR/_99_trap_done.txt" 2>/dev/null || true
+
+    # Clean up the temp extract dir if it exists. Previously this had
+    # its own `trap 'rm -rf $WORK_DIR' EXIT` which silently OVERWROTE
+    # this cleanup_log trap and threw away the persistent log. Doing
+    # it inline here keeps both jobs on one trap handler.
+    if [ -n "${WORK_DIR:-}" ] && [ -d "${WORK_DIR:-}" ]; then
+        rm -rf "$WORK_DIR" 2>/dev/null || true
+    fi
+
+    # Encourage FUSE to flush the local S3-backed cache to the underlying
+    # object store before the host terminates. sync is best-effort on
+    # FUSE but cheap; the sleep is what gives FUSE time to actually do
+    # the PUT — Databricks may terminate the host within seconds of
+    # init-script exit.
+    sync 2>/dev/null || true
+    sleep 3
+
+    # Propagate the status captured on entry (non-zero == init failure).
+    exit "$EXIT_CODE"
+}
+trap cleanup_log EXIT
+
+# Mirror all stdout+stderr to the local /tmp file. The trap above is
+# what persists this to VOL_DIR; tee here only handles the script→file
+# pipe, and tee's original stdout is still the parent's stdout, so the
+# Databricks Event Log capture continues to surface output in parallel.
+exec > >(tee -a "$LOCAL_LOG") 2>&1
+
+echo "--- Init script started at $(date -Iseconds) ---"
+echo "Cluster ID: $CLUSTER_ID"
+echo "Hostname: $(hostname)"
+echo "Local log: $LOCAL_LOG"
+echo "Final log: $FINAL_LOG"
+
+# ---- preflight -----------------------------------------------------------
+
+# Refuse to run on ARM. The bundled .debs are amd64 (a.k.a. x86_64) only —
+# proceeding on aarch64 would silently install nothing useful (dpkg would
+# reject every package) and the failure mode would be confusing.
Intel and +# AMD CPUs both report `amd64` here and are equally supported. +step 02_preflight +ARCH="$(dpkg --print-architecture)" +if [ "$ARCH" != "amd64" ]; then + echo "Unsupported architecture: $ARCH" >&2 + echo "The GeoBrix GDAL bundle ships amd64 / x86_64 .debs only (Intel or AMD CPUs)." >&2 + echo "ARM-based instance types — AWS Graviton, Ampere, Apple Silicon — are not supported." >&2 + echo "Choose a non-ARM instance type for this cluster." >&2 + exit 1 +fi + +# Pre-empt apt-daily timers so we don't sit on /var/lib/dpkg/lock-frontend +# while unattended-upgrades runs its boot-time pass. The PPA script used +# DPkg::Lock::Timeout=-1, which made lock contention invisible — here we +# just take the lock out of contention entirely. +sudo systemctl stop --no-block \ + apt-daily.service apt-daily-upgrade.service unattended-upgrades 2>/dev/null || true -# Update VOL_DIR to point at the Unity Catalog volume where you've staged -# libgdalalljni.so + geobrix-*-jar-with-dependencies.jar before deploying -# this script to a cluster. -VOL_DIR="/Volumes/geospatial_docs/gdal_artifacts/noble/geobrix" if [ ! -d "$VOL_DIR" ]; then echo "VOL_DIR not found: $VOL_DIR" >&2 - echo "Edit this script and set VOL_DIR to the volume containing the GeoBrix native + JAR artifacts before re-running." >&2 + echo "Upload the release tarball + matching .sha256 sidecar to $VOL_DIR before running." >&2 exit 1 fi -# install natives — keep GDAL_PPA_VERSION in sync with CI (.github/actions/*/action.yml). 
-# https://gdal.org/en/stable/api/python/python_bindings.html -# https://medium.com/@felipempfreelancer/install-gdal-for-python-on-ubuntu-24-04-9ed65dd39cac -GDAL_PPA_VERSION="3.11.4+dfsg-1~noble0" -sudo apt-get -o DPkg::Lock::Timeout=-1 install -y unixodbc libcurl3-gnutls libsnappy-dev libopenjp2-7 -sudo apt-get -o DPkg::Lock::Timeout=-1 install -y \ - "libgdal-dev=${GDAL_PPA_VERSION}" \ - "gdal-bin=${GDAL_PPA_VERSION}" \ - "python3-gdal=${GDAL_PPA_VERSION}" - -# pip install GDAL (match deps to DBR 17.3 LTS — see release notes for the runtime). -# Bootstrap pins must match .github/actions/{scala,python}_build/action.yml — keep these in sync. -pip install --upgrade pip==25.0.1 setuptools==74.0.0 wheel==0.45.1 cython==3.0.12 -pip install numpy==2.1.3 -export GDAL_CONFIG=/usr/bin/gdal-config -# --no-binary :all: forces sdist compile against the apt-installed libgdal -# headers above (signed by the fingerprint-pinned ubuntugis key), rather -# than accepting whatever pre-built wheel PyPI happens to serve. -pip install --no-cache-dir --no-binary :all: --force-reinstall GDAL[numpy]=="$(gdal-config --version).*" - -# copy JNI and JAR. Quote VOL_DIR so paths with spaces don't break under -# `set -u`; the glob expands after substitution. -cp "$VOL_DIR/libgdalalljni.so" /usr/lib/libgdalalljni.so -cp "$VOL_DIR"/geobrix-*-jar-with-dependencies.jar /databricks/jars +cd "$VOL_DIR" + +# ---- discover bundle from sidecar --------------------------------------- +# The release ships geobrix-gdal-artifacts-vX.Y.Z-noble.tar.gz + .sha256; +# the operator uploads both, unchanged, to VOL_DIR. We glob for the .sha256 +# sidecar (expect exactly one) and let `sha256sum -c` read the tarball name +# out of it. This decouples script-version from bundle-version: a security +# patch can re-stage a new bundle without an init-script change. 
+ +shopt -s nullglob +sidecars=(*.tar.gz.sha256) +case "${#sidecars[@]}" in + 0) + echo "no *.tar.gz.sha256 sidecar found in $VOL_DIR" >&2 + echo "Stage the GeoBrix release tarball and its matching .sha256 file in this volume." >&2 + exit 1 + ;; + 1) SIDECAR="${sidecars[0]}" ;; + *) + echo "multiple *.tar.gz.sha256 sidecars in $VOL_DIR — expected exactly one active bundle." >&2 + echo "Remove the older sidecar(s) so this script knows which bundle to install:" >&2 + printf ' %s\n' "${sidecars[@]}" >&2 + exit 1 + ;; +esac + +# The sidecar's standard ` ` line names the tarball; pull +# it out so we have a handle for extraction and error messages below. +ARTIFACT_TARBALL="$(awk 'NR==1 {sub(/^\*/, "", $2); print $2}' "$SIDECAR")" +if [ -z "$ARTIFACT_TARBALL" ] || [ ! -f "$ARTIFACT_TARBALL" ]; then + echo "sidecar $SIDECAR references tarball '$ARTIFACT_TARBALL' but it's not present in $VOL_DIR" >&2 + exit 1 +fi + +# ---- verify tarball ------------------------------------------------------ +# Outer trust anchor: tarball must match the sidecar staged alongside it. +# Trust binds via UC Volume ACLs — the sidecar is only present here if the +# release/CI process put it there. +step 03_sidecar_resolved "$ARTIFACT_TARBALL" +echo "==> Verifying $ARTIFACT_TARBALL against $SIDECAR..." +sha256sum -c "$SIDECAR" +step 04_outer_sha256_ok + +# ---- extract ------------------------------------------------------------- +# Extract to local /tmp rather than working off the FUSE-mounted volume. +# VOL_DIR sequential reads are fine for the tarball; per-file random I/O +# during dpkg/pip runs faster against local disk. +WORK_DIR="$(mktemp -d -t geobrix-gdal-XXXXXX)" +# Do NOT install a new EXIT trap here — that would override the +# cleanup_log trap installed at the top of the script, throwing away +# the persistent-log behavior. cleanup_log() now rms WORK_DIR itself +# (see the rm line at the bottom of that function). 
+tar -xzf "$ARTIFACT_TARBALL" -C "$WORK_DIR" --strip-components=1 +step 05_extracted + +cd "$WORK_DIR" + +# Inner trust anchor: per-file manifest. The outer hash already proved the +# tarball is what CI built; this confirms the extraction wasn't corrupted +# and gives a per-file pin that's useful for forensic comparison later. +if [ ! -f SHA256SUMS ]; then + echo "SHA256SUMS missing inside tarball — bundle is malformed." >&2 + exit 1 +fi +sha256sum -c SHA256SUMS +step 06_inner_sha256_ok + +# ---- install .debs ------------------------------------------------------- +# Install everything the build script staged in one dpkg invocation. dpkg +# satisfies intra-set deps regardless of file order via its two-pass +# unpack-then-configure flow. +# +# Security: the .debs were resolved through the UbuntuGIS PPA whose key +# fingerprint was verified at build time in scripts/build-gdal-artifacts.sh +# (UBUNTUGIS_FPR), and then SHA256-pinned via the bundle's inner +# SHA256SUMS manifest which we verified at step 06_inner_sha256_ok. +# No bytes installed here came from an unverified source. +# +# Deliberately NO `|| apt-get install -fy` fallback. The fallback would +# reach out to whatever apt sources the cluster has configured (default +# Ubuntu archives, possibly the UbuntuGIS PPA itself) and could silently +# install or remove packages — both of which defeat the SHA256-pinned +# trust model. If dpkg fails here, the right response is to fix the +# bundle in scripts/build-gdal-artifacts.sh and re-release, not paper +# over the failure at install time. +step 07_about_to_dpkg +sudo dpkg -i debs/*.deb +step 08_dpkg_done + +# ---- install Python bindings -------------------------------------------- +# Security: every pip install on this cluster reads ONLY from the bundle's +# wheels/ directory (--find-links + --no-index forbids any PyPI lookup, +# --no-cache-dir forbids any stale wheel reuse). 
The wheels themselves +# were SHA256-pinned in the bundle's inner SHA256SUMS manifest which we +# verified at step 06_inner_sha256_ok. End-to-end the bytes installed +# trace back to the PR that committed the platform tarball into +# resources/static/ — satisfies the project's hash-pinned Python policy +# (see docs/docs/security.mdx "Hash-pinned Python dependencies"). +# +# Bootstrap the pip toolchain from staged wheels first. Specific versions +# are determined by the wheel filenames in the bundle (pinned at build +# time by PIP_VERSION / SETUPTOOLS_VERSION / etc. in build-gdal-artifacts.sh). +pip install --upgrade --no-index --no-cache-dir --find-links=wheels/ \ + pip setuptools wheel cython numpy +step 09_pip_toolchain_done + +# CI built this GDAL wheel with --no-binary :all: against the libgdal-dev +# headers from the fingerprint-pinned PPA, so the bytes verified above are +# exactly the bindings we want. --no-deps because numpy is already installed +# (pip would otherwise re-resolve numpy from PyPI via the [numpy] extra, +# which --no-index already blocks but --no-deps makes redundant-safe). +pip install --force-reinstall --no-index --no-deps --no-cache-dir --find-links=wheels/ GDAL +step 10_gdal_wheel_done + +# Note: the GeoBrix Python wheel (dblabs_geobrix-*-py3-none-any.whl) is +# NOT installed here. It's attached separately as a cluster Library via +# the Databricks UI (Cluster → Libraries → Install new → Upload Python +# Whl). Keeping it out of this script lets the Python wheel be versioned +# and bumped independently of the GDAL platform tarball — bump GeoBrix +# without touching the init script. + +# ---- native + JAR -------------------------------------------------------- +cp libgdalalljni.so /usr/lib/libgdalalljni.so +step 11_jni_copied + +# The GeoBrix JAR can come from one of two places: +# 1. Bundled inside the tarball (release path) — package-geobrix-artifacts.yml +# grafts it in, so the cluster gets it via SHA256SUMS-verified extraction. +# 2. 
Staged alongside the tarball in VOL_DIR (operator path) — useful for
+#      smoke-testing a fresh platform tarball before it's baked into a
+#      release, and for hot-swapping a JAR without rebuilding the tarball.
+#      Trust here devolves to the same UC Volume write ACL that protects the
+#      tarball + sidecar — a write to VOL_DIR is already privileged.
+# If neither has a JAR, log clearly and proceed — the GDAL stack still
+# installs; GeoBrix functions just won't be registered until a JAR is
+# supplied. Exiting non-zero here would force every smoke test to ship a
+# JAR even when only the platform layer is under test.
+#
+# `compgen -G` is used as the existence probe: it returns non-zero when
+# the glob matches nothing, so the cp below never sees an unmatched glob.
+if compgen -G "geobrix-*-jar-with-dependencies.jar" > /dev/null; then
+    echo "==> Installing GeoBrix JAR from bundle."
+    cp geobrix-*-jar-with-dependencies.jar /databricks/jars/
+elif compgen -G "$VOL_DIR/geobrix-*-jar-with-dependencies.jar" > /dev/null; then
+    echo "==> No JAR in bundle; using JAR from VOL_DIR."
+    cp "$VOL_DIR"/geobrix-*-jar-with-dependencies.jar /databricks/jars/
+else
+    echo "==> WARNING: no GeoBrix JAR in bundle or VOL_DIR; GDAL stack installed but GeoBrix functions will not be available until a JAR is staged." >&2
+fi
+step 12_jar_done
+
+# ---- log installed PPA-sourced versions ---------------------------------
+# Goes to the init-script stdout log so you can later confirm a given
+# cluster is on the libproj/libgeos/etc. set captured in this bundle.
+# Compare against PACKAGES.txt inside the tarball if you ever need to
+# audit version drift after the fact.
+#
+# Audit output only — it must never fail the init script. Under
+# `set -euo pipefail` a grep with zero matches exits 1, which would abort
+# the script right here, after the whole stack is already installed and
+# before the final breadcrumb is written. The group + `||` keeps this
+# step best-effort while still surfacing the anomaly on stderr.
+echo "==> Installed PPA-sourced versions on cluster:"
+{
+    dpkg-query -W -f='${Package} ${Version}\n' 2>/dev/null \
+        | grep -E '^(libgdal|libproj|libgeos|proj-data|proj-bin|python3-gdal|gdal-bin|libspatialite|libnetcdf|libhdf[45])' \
+        | LC_ALL=C sort \
+        | sed 's/^/ /'
+} || echo " (no matching packages reported by dpkg-query)" >&2
+step 13_script_complete