From 5ab4477c1c03195dfbd8eb56a3d16cc3c903e0b7 Mon Sep 17 00:00:00 2001 From: Harrison Crosse Date: Wed, 18 Mar 2026 15:04:51 -0400 Subject: [PATCH 1/2] fix(parquet): strip repetition_type from root SchemaElement during serialization The Parquet spec states the root of the schema does not have a repetition_type. arrow-go was writing REPEATED for the root SchemaElement in the Thrift footer, which is non-standard and breaks interoperability with consumers like Snowflake. Strip RepetitionType from the root element in ToThrift(), matching parquet-java and arrow-rs behavior. The in-memory representation and WithRootRepetition API are unaffected. FromParquet already tolerates a nil root repetition type, so this is backwards-compatible for both readers and writers. Closes #722 --- parquet/schema/schema.go | 1 + parquet/schema/schema_flatten_test.go | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/parquet/schema/schema.go b/parquet/schema/schema.go index 6d124eb17..3ff376890 100644 --- a/parquet/schema/schema.go +++ b/parquet/schema/schema.go @@ -272,6 +272,7 @@ func (t *toThriftVisitor) VisitPost(Node) {} func ToThrift(schema *GroupNode) []*format.SchemaElement { t := &toThriftVisitor{make([]*format.SchemaElement, 0)} schema.Visit(t) + t.elements[0].RepetitionType = nil return t.elements } diff --git a/parquet/schema/schema_flatten_test.go b/parquet/schema/schema_flatten_test.go index ecbb431c2..a39391610 100644 --- a/parquet/schema/schema_flatten_test.go +++ b/parquet/schema/schema_flatten_test.go @@ -92,8 +92,10 @@ func (s *SchemaFlattenSuite) TestDecimalMetadata() { func (s *SchemaFlattenSuite) TestNestedExample() { elements := make([]*format.SchemaElement, 0) + root := NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */) + root.RepetitionType = nil elements = append(elements, - NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */), + root, NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */), NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 2 /* fieldID */)) @@ -120,6 +122,23 @@ func TestSchemaFlatten(t *testing.T) { suite.Run(t, new(SchemaFlattenSuite)) } +func TestToThriftRootRepetitionStripped(t *testing.T) { + for _, rep := range []parquet.Repetition{ + parquet.Repetitions.Repeated, + parquet.Repetitions.Required, + parquet.Repetitions.Optional, + } { + group := MustGroup(NewGroupNode("schema", rep, FieldList{ + NewInt32Node("a", parquet.Repetitions.Required, -1), + }, -1)) + elements := ToThrift(group) + assert.False(t, elements[0].IsSetRepetitionType(), + "root element should not have repetition_type set (was %v)", rep) + assert.True(t, elements[1].IsSetRepetitionType(), + "non-root element must have repetition_type set") + } +} + func TestInvalidConvertedTypeInDeserialize(t *testing.T) { n := MustPrimitive(NewPrimitiveNodeLogical("string" /* name */, parquet.Repetitions.Required, StringLogicalType{}, parquet.Types.ByteArray, -1 /* type len */, -1 /* fieldID */)) From 3ef263efc74d447f393840b8a02d848b27f9fb6b Mon Sep 17 00:00:00 2001 From: Harrison Crosse Date: Wed, 18 Mar 2026 15:17:41 -0400 Subject: [PATCH 2/2] fix(parquet): update expected byte counts after root repetition stripping Stripping the root SchemaElement's repetition_type removes 2 bytes from the serialized Thrift footer (field header + enum value). --- parquet/pqarrow/file_writer_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/pqarrow/file_writer_test.go b/parquet/pqarrow/file_writer_test.go index d8503ccb5..32713cd2f 100644 --- a/parquet/pqarrow/file_writer_test.go +++ b/parquet/pqarrow/file_writer_test.go @@ -172,7 +172,7 @@ func TestFileWriterTotalBytes(t *testing.T) { // Verify total bytes & compressed bytes are correct assert.Equal(t, int64(408), writer.TotalCompressedBytes()) - assert.Equal(t, int64(912), writer.TotalBytesWritten()) + assert.Equal(t, int64(910), writer.TotalBytesWritten()) } func TestFileWriterTotalBytesBuffered(t *testing.T) { @@ -206,5 +206,5 @@ func TestFileWriterTotalBytesBuffered(t *testing.T) { // Verify total bytes & compressed bytes are correct assert.Equal(t, int64(596), writer.TotalCompressedBytes()) - assert.Equal(t, int64(1308), writer.TotalBytesWritten()) + assert.Equal(t, int64(1306), writer.TotalBytesWritten()) }