Skip to content
Merged
56 changes: 33 additions & 23 deletions parquet/file/column_writer_types.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

56 changes: 33 additions & 23 deletions parquet/file/column_writer_types.gen.go.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -430,39 +430,49 @@ func (w *{{.Name}}ColumnChunkWriter) writeBitmapValues(bitmap []byte, bitmapOffs
w.currentEncoder.(encoding.BooleanEncoder).Put(values)
}

// Note: Statistics and bloom filter updates would require converting back to []bool
// For now, skip them to maintain the performance benefit
// In production, we'd need bitmap-aware statistics/bloom filter methods
// Update statistics and bloom filter using bitmap-aware methods
if w.pageStatistics != nil {
// Convert for statistics (unavoidable for now)
values := make([]bool, numValues)
for i := int64(0); i < numValues; i++ {
values[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
}
w.pageStatistics.(*metadata.BooleanStatistics).Update(values, numNulls)
w.pageStatistics.(*metadata.BooleanStatistics).UpdateFromBitmap(bitmap, bitmapOffset, numValues, numNulls)
}
if w.bloomFilter != nil {
// Convert for bloom filter (unavoidable for now)
values := make([]bool, numValues)
for i := int64(0); i < numValues; i++ {
values[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
}
w.bloomFilter.InsertBulk(metadata.GetHashes(w.bloomFilter.Hasher(), values))
w.bloomFilter.InsertBulk(metadata.GetHashesFromBitmap(w.bloomFilter.Hasher(), bitmap, bitmapOffset, numValues))
}
}

// writeBitmapValuesSpaced writes boolean values from a bitmap with validity information.
//
// bitmap/bitmapOffset describe numValues value slots and validBits/validBitsOffset
// describe the matching validity bits. numRead is treated as the count of non-null
// values: the null count handed to the statistics is numValues - numRead.
func (w *{{.Name}}ColumnChunkWriter) writeBitmapValuesSpaced(bitmap []byte, bitmapOffset int64, numRead, numValues int64, validBits []byte, validBitsOffset int64) {
// NOTE(review): this listing appears to show the earlier []bool-only
// implementation (the three comment lines below, the unconditional
// conversion loop and the writeValuesSpaced call) side by side with its
// replacement — confirm which lines are current before editing.
// For spaced writes, we need to compress the bitmap according to validity
// This requires converting to []bool for now
// A future optimization could implement bitmap-to-bitmap compression
spacedValues := make([]bool, numValues)
for i := int64(0); i < numValues; i++ {
spacedValues[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
// Try to use bitmap-aware encoder interface if available
// The interface is declared locally and matched with a type assertion, so
// encoders opt in simply by providing a PutSpacedBitmap method.
type bitmapSpacedEncoder interface {
PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues int64, validBits []byte, validBitsOffset int64) int64
}

// Use existing spaced write logic
w.writeValuesSpaced(spacedValues, numRead, numValues, validBits, validBitsOffset)
if enc, ok := w.currentEncoder.(bitmapSpacedEncoder); ok {
// Direct bitmap path - no []bool conversion!
// The count returned by PutSpacedBitmap is not used here.
enc.PutSpacedBitmap(bitmap, bitmapOffset, numValues, validBits, validBitsOffset)
} else {
// Fallback: convert to []bool for encoders that don't support bitmap interface
spacedValues := make([]bool, numValues)
for i := int64(0); i < numValues; i++ {
spacedValues[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
}

// When the slot count differs from numRead, validBits is passed along via
// PutSpaced; when they are equal the whole slice goes through plain Put.
if len(spacedValues) != int(numRead) {
w.currentEncoder.(encoding.BooleanEncoder).PutSpaced(spacedValues, validBits, validBitsOffset)
} else {
w.currentEncoder.(encoding.BooleanEncoder).Put(spacedValues)
}
}

// Use bitmap-aware statistics update
if w.pageStatistics != nil {
// Null count is derived from the two counts supplied by the caller.
nulls := numValues - numRead
w.pageStatistics.(*metadata.BooleanStatistics).UpdateFromBitmapSpaced(bitmap, bitmapOffset, numValues, validBits, validBitsOffset, nulls)
}

// Use bitmap-aware bloom filter hashing
if w.bloomFilter != nil {
w.bloomFilter.InsertBulk(metadata.GetSpacedHashesFromBitmap(w.bloomFilter.Hasher(), numRead, bitmap, bitmapOffset, numValues, validBits, validBitsOffset))
}
}

{{end}}
Expand Down
97 changes: 97 additions & 0 deletions parquet/internal/encoding/boolean_encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"encoding/binary"

"github.com/apache/arrow-go/v18/arrow/bitutil"
"github.com/apache/arrow-go/v18/internal/bitutils"
"github.com/apache/arrow-go/v18/parquet"
"github.com/apache/arrow-go/v18/parquet/internal/debug"
"github.com/apache/arrow-go/v18/parquet/internal/utils"
Expand All @@ -30,6 +31,41 @@ const (
boolsInBuf = boolBufSize * 8
)

// compressBitmapWithValidity packs the bits of srcBitmap whose matching
// validity bit is set into a freshly allocated, contiguous bitmap.
//
// The source window is numValues bits long starting at srcOffset, and the
// validity window is numValues bits long starting at validBitsOffset.
// numValid sizes the result and is expected to be the number of set bits in
// the validity window. An empty, non-nil slice is returned when numValid is
// zero.
func compressBitmapWithValidity(
	srcBitmap []byte,
	srcOffset int64,
	numValues int64,
	validBits []byte,
	validBitsOffset int64,
	numValid int64,
) []byte {
	if numValid == 0 {
		return []byte{}
	}

	// The output holds exactly numValid bits, written from bit 0.
	packed := make([]byte, bitutil.BytesForBits(numValid))
	out := utils.NewBitmapWriter(packed, 0, int(numValid))

	// Visit the validity bitmap one run of set bits at a time; iteration stops
	// at the first zero-length run.
	runs := bitutils.NewSetBitRunReader(validBits, validBitsOffset, numValues)
	for run := runs.NextRun(); run.Length != 0; run = runs.NextRun() {
		// run.Pos is added to srcOffset to locate the same run in the source.
		out.AppendBitmap(srcBitmap, srcOffset+run.Pos, run.Length)
	}
	out.Finish()

	return packed
}

// PlainBooleanEncoder encodes bools as a bitmap as per the Plain Encoding
type PlainBooleanEncoder struct {
encoder
Expand Down Expand Up @@ -99,6 +135,29 @@ func (enc *PlainBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBits
enc.Put(bufferOut[:nvalid])
}

// PutSpacedBitmap encodes boolean values straight from a bitmap, skipping the
// slots that validBits marks as null, without converting to []bool.
//
// bitmap/bitmapOffset and validBits/validBitsOffset each describe a window of
// numValues bits. The return value is the number of valid (non-null) values
// that were encoded. Zero is returned, and nothing is encoded, when numValues
// is zero or no validity bit is set in the window.
func (enc *PlainBooleanEncoder) PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues int64, validBits []byte, validBitsOffset int64) int64 {
	if numValues == 0 {
		return 0
	}

	numValid := int64(bitutil.CountSetBits(validBits, int(validBitsOffset), int(numValues)))
	switch numValid {
	case 0:
		return 0
	case numValues:
		// Every slot is valid, so compaction would only produce a copy of the
		// input window; encode the caller's bitmap in place and skip the
		// allocation.
		enc.PutBitmap(bitmap, bitmapOffset, numValues)
		return numValid
	}

	// Drop the null slots so the encoder receives a dense run of numValid bits
	// starting at bit 0.
	compressed := compressBitmapWithValidity(bitmap, bitmapOffset, numValues, validBits, validBitsOffset, numValid)
	enc.PutBitmap(compressed, 0, numValid)

	return numValid
}

// EstimatedDataEncodedSize returns the current number of bytes that have
// been buffered so far
func (enc *PlainBooleanEncoder) EstimatedDataEncodedSize() int64 {
Expand Down Expand Up @@ -140,6 +199,44 @@ func (enc *RleBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBitsOf
enc.Put(bufferOut[:nvalid])
}

// PutSpacedBitmap encodes the non-null boolean values of a bitmap.
//
// bitmap/bitmapOffset and validBits/validBitsOffset each describe a window of
// numValues bits. Only the slots whose validity bit is set are expanded into a
// []bool, which is then handed to Put; null slots are never converted. The
// return value is the number of valid values passed to Put, or zero (with no
// call to Put) when numValues is zero or no validity bit is set.
func (enc *RleBooleanEncoder) PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues int64, validBits []byte, validBitsOffset int64) int64 {
	if numValues == 0 {
		return 0
	}

	numValid := int64(bitutil.CountSetBits(validBits, int(validBitsOffset), int(numValues)))
	if numValid == 0 {
		return 0
	}

	// One output element per set validity bit, filled in run order.
	valid := make([]bool, numValid)
	next := 0

	runs := bitutils.NewSetBitRunReader(validBits, validBitsOffset, numValues)
	for run := runs.NextRun(); run.Length != 0; run = runs.NextRun() {
		// run.Pos is added to bitmapOffset to find the run's first value bit.
		start := bitmapOffset + run.Pos
		for off := int64(0); off < run.Length; off++ {
			valid[next] = bitutil.BitIsSet(bitmap, int(start+off))
			next++
		}
	}

	enc.Put(valid)
	return numValid
}

// EstimatedDataEncodedSize returns rleLengthInBytes plus the encoder's
// maximum RLE buffer size.
func (enc *RleBooleanEncoder) EstimatedDataEncodedSize() int64 {
	maxBuffered := int64(enc.maxRleBufferSize())
	return rleLengthInBytes + maxBuffered
}
Expand Down
Loading