From edf7f3bc42bad0733f97aafb76cfb48f7bd7dff7 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Wed, 6 May 2026 21:19:06 -0700 Subject: [PATCH 1/2] [GH-2886] Recognize Box2D columns as GeoParquet bbox covering columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a user has a Box2D-typed column in the schema being written, both the explicit covering option (geoparquet.covering[.geom]=) and the auto-detect _bbox path now register it in GeoParquet metadata as the bbox covering column for the associated geometry. Box2DUDT.sqlType is a struct — the exact shape required by the GeoParquet 1.1 covering spec — so no data-path change is needed: the existing UDT->struct fallback already serializes correctly. Only the metadata path needed to learn that a UDT-wrapped struct is a valid covering source. Float32 + conservative outward rounding (Math.nextUp/Math.nextDown, matching apache/sedona-db's next_after) is intentionally deferred to pair with reader-side auto-materialization of covering columns as Box2D — without that pairing, writing Float32 would create a write/read asymmetry where reads come back as struct instead of Box2D, regressing user-visible types. Closes #2886. 
--- .../geoparquet/GeoParquetMetaData.scala | 9 ++++ .../apache/sedona/sql/geoparquetIOTests.scala | 47 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala index 7517b8e5fc4..a1ba02ee313 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.geoparquet import scala.util.control.NonFatal +import org.apache.spark.sql.sedona_sql.UDT.Box2DUDT import org.apache.spark.sql.types.{DoubleType, FloatType, StructType} import org.datasyslab.proj4sedona.core.Proj import org.datasyslab.proj4sedona.parser.CRSSerializer @@ -236,6 +237,14 @@ object GeoParquetMetaData { schema(coveringColumnIndex).dataType match { case coveringColumnType: StructType => coveringColumnTypeToCovering(coveringColumnName, coveringColumnType) + case _: Box2DUDT => + // Box2DUDT exposes a struct sqlType, which is the exact + // shape required by GeoParquet 1.1 bbox covering columns. Treat the underlying struct as + // the covering struct so users can write a Box2D column and have it referenced as a + // covering column in GeoParquet metadata without any manual struct construction. 
+ coveringColumnTypeToCovering( + coveringColumnName, + new Box2DUDT().sqlType.asInstanceOf[StructType]) case _ => throw new IllegalArgumentException( s"Covering column $coveringColumnName is not a struct type") diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala index 5d2a13f4867..2924292cd5f 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala @@ -1021,6 +1021,53 @@ class geoparquetIOTests extends TestBaseScala with BeforeAndAfterAll { } } + it("GeoParquet supports writing covering metadata from a Box2D column") { + // User-provided Box2D column referenced via the geoparquet.covering option. + val df = sparkSession + .range(0, 100) + .toDF("id") + .withColumn("id", expr("CAST(id AS DOUBLE)")) + .withColumn("geometry", expr("ST_Point(id, id + 1)")) + .withColumn("test_cov", expr("ST_Box2D(geometry)")) + val geoParquetSavePath = geoparquetoutputlocation + "/gp_with_box2d_covering.parquet" + df.write + .format("geoparquet") + .option("geoparquet.covering.geometry", "test_cov") + .mode("overwrite") + .save(geoParquetSavePath) + validateGeoParquetMetadata(geoParquetSavePath) { geo => + implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats + val coveringJsValue = geo \ "columns" \ "geometry" \ "covering" + val covering = coveringJsValue.extract[Covering] + assert(covering.bbox.xmin == Seq("test_cov", "xmin")) + assert(covering.bbox.ymin == Seq("test_cov", "ymin")) + assert(covering.bbox.xmax == Seq("test_cov", "xmax")) + assert(covering.bbox.ymax == Seq("test_cov", "ymax")) + } + } + + it("GeoParquet auto populates covering metadata for a Box2D _bbox column") { + // Auto-detect path: when a column named _bbox is a Box2D, reuse it as the + // covering column instead of synthesizing a separate float64 struct. 
+ val df = sparkSession + .range(0, 100) + .toDF("id") + .withColumn("id", expr("CAST(id AS DOUBLE)")) + .withColumn("geometry", expr("ST_Point(id, id + 1)")) + .withColumn("geometry_bbox", expr("ST_Box2D(geometry)")) + val geoParquetSavePath = geoparquetoutputlocation + "/gp_box2d_auto_covering.parquet" + df.write.format("geoparquet").mode("overwrite").save(geoParquetSavePath) + validateGeoParquetMetadata(geoParquetSavePath) { geo => + implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats + val coveringJsValue = geo \ "columns" \ "geometry" \ "covering" + val covering = coveringJsValue.extract[Covering] + assert(covering.bbox.xmin == Seq("geometry_bbox", "xmin")) + assert(covering.bbox.ymin == Seq("geometry_bbox", "ymin")) + assert(covering.bbox.xmax == Seq("geometry_bbox", "xmax")) + assert(covering.bbox.ymax == Seq("geometry_bbox", "ymax")) + } + } + it("GeoParquet auto populates covering metadata for single geometry column") { val df = sparkSession .range(0, 100) From 776c5b934cf912fcaf760d2076ed29874fe1f536 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Thu, 7 May 2026 21:35:04 -0700 Subject: [PATCH 2/2] Use Box2DUDT instance via bound case in covering metadata Avoids the per-call allocation of new Box2DUDT() and the asInstanceOf[StructType] cast. Falls back to a clear IllegalStateException if Box2DUDT.sqlType ever changes shape. 
--- .../datasources/geoparquet/GeoParquetMetaData.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala index a1ba02ee313..4068b3f69a8 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala @@ -237,14 +237,18 @@ object GeoParquetMetaData { schema(coveringColumnIndex).dataType match { case coveringColumnType: StructType => coveringColumnTypeToCovering(coveringColumnName, coveringColumnType) - case _: Box2DUDT => + case udt: Box2DUDT => // Box2DUDT exposes a struct sqlType, which is the exact // shape required by GeoParquet 1.1 bbox covering columns. Treat the underlying struct as // the covering struct so users can write a Box2D column and have it referenced as a // covering column in GeoParquet metadata without any manual struct construction. - coveringColumnTypeToCovering( - coveringColumnName, - new Box2DUDT().sqlType.asInstanceOf[StructType]) + udt.sqlType match { + case structType: StructType => + coveringColumnTypeToCovering(coveringColumnName, structType) + case other => + throw new IllegalStateException( + s"Box2DUDT.sqlType is expected to be a StructType, got $other") + } case _ => throw new IllegalArgumentException( s"Covering column $coveringColumnName is not a struct type")