From 54bc343e29c43c0d8c18d95b2ee796fd2afa5c44 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 6 Feb 2026 18:48:08 -0500 Subject: [PATCH 01/22] Add alp to the thrift definitions --- parquet/src/basic.rs | 10 ++++++++++ parquet/src/format.rs | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ba8ffc2e92c3..b251058bd6e9 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -634,6 +634,10 @@ enum Encoding { /// afterwards. Note that the use of this encoding with FIXED_LEN_BYTE_ARRAY(N) data may /// perform poorly for large values of N. BYTE_STREAM_SPLIT = 9; + /// Adaptive Lossless floating-Point encoding (ALP). + /// + /// Currently specified for FLOAT and DOUBLE. + ALP = 10; } ); @@ -654,6 +658,7 @@ impl FromStr for Encoding { "DELTA_BYTE_ARRAY" | "delta_byte_array" => Ok(Encoding::DELTA_BYTE_ARRAY), "RLE_DICTIONARY" | "rle_dictionary" => Ok(Encoding::RLE_DICTIONARY), "BYTE_STREAM_SPLIT" | "byte_stream_split" => Ok(Encoding::BYTE_STREAM_SPLIT), + "ALP" | "alp" => Ok(Encoding::ALP), _ => Err(general_err!("unknown encoding: {}", s)), } } @@ -791,6 +796,7 @@ fn i32_to_encoding(val: i32) -> Encoding { 7 => Encoding::DELTA_BYTE_ARRAY, 8 => Encoding::RLE_DICTIONARY, 9 => Encoding::BYTE_STREAM_SPLIT, + 10 => Encoding::ALP, _ => panic!("Impossible encoding {val}"), } } @@ -2137,6 +2143,7 @@ mod tests { ); assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY"); assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY"); + assert_eq!(Encoding::ALP.to_string(), "ALP"); } #[test] @@ -2438,6 +2445,8 @@ mod tests { assert_eq!(encoding, Encoding::RLE_DICTIONARY); encoding = "BYTE_STREAM_SPLIT".parse().unwrap(); assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT); + encoding = "alp".parse().unwrap(); + assert_eq!(encoding, Encoding::ALP); // test lowercase encoding = "byte_stream_split".parse().unwrap(); @@ -2573,6 +2582,7 @@ mod tests { Encoding::PLAIN_DICTIONARY, Encoding::RLE_DICTIONARY, Encoding::BYTE_STREAM_SPLIT, + Encoding::ALP, ]; encodings_roundtrip(encodings.into()); } diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 101799d00350..402f20ddf488 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -450,6 +450,8 @@ impl Encoding { /// Added in 2.8 for FLOAT and DOUBLE. /// Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9); + /// Adaptive Lossless floating-Point encoding (ALP) for FLOAT and DOUBLE. + pub const ALP: Encoding = Encoding(10); pub const ENUM_VALUES: &'static [Self] = &[ Self::PLAIN, Self::PLAIN_DICTIONARY, @@ -460,6 +462,7 @@ impl Encoding { Self::DELTA_BYTE_ARRAY, Self::RLE_DICTIONARY, Self::BYTE_STREAM_SPLIT, + Self::ALP, ]; } @@ -486,6 +489,7 @@ impl From for Encoding { 7 => Encoding::DELTA_BYTE_ARRAY, 8 => Encoding::RLE_DICTIONARY, 9 => Encoding::BYTE_STREAM_SPLIT, + 10 => Encoding::ALP, _ => Encoding(i) } } @@ -6041,4 +6045,3 @@ impl crate::thrift::TSerializable for FileCryptoMetaData { o_prot.write_struct_end() } } - From 622841b3c7e05231755e7398b2cac836f3339365 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 6 Feb 2026 18:48:29 -0500 Subject: [PATCH 02/22] Add alp to the encoding enum --- parquet/src/encodings/decoding.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 58430820a9b6..4b3e937df124 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -67,6 +67,9 @@ pub(crate) mod private { "Encoding {} is not supported for type", encoding )), + Encoding::ALP => Err(nyi_err!( + "Encoding ALP is not implemented" + )), e => Err(nyi_err!("Encoding {} is not supported", e)), } } From bf754b2eb8b53fbbbe943abf52a72c3d0fd27b9c Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Feb 2026 16:43:49 -0500 Subject: [PATCH 03/22] Add alp header and page layout structs --- parquet-testing | 2 +- parquet/src/encodings/decoding.rs | 12 +- parquet/src/encodings/decoding/alp.rs | 265 ++++++++++++++++++++++++++ parquet/tests/arrow_reader/alp.rs | 41 ++++ parquet/tests/arrow_reader/mod.rs | 1 + 5 files changed, 316 insertions(+), 5 deletions(-) create mode 100644 parquet/src/encodings/decoding/alp.rs create mode 100644 parquet/tests/arrow_reader/alp.rs diff --git a/parquet-testing b/parquet-testing index a3d96a65e11e..abd2478aea41 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 +Subproject commit abd2478aea41d6cb10352b2277a7dc67ae20a02a diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 4b3e937df124..36c7f6720104 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -29,10 +29,12 @@ use crate::data_type::*; use crate::encodings::decoding::byte_stream_split_decoder::{ ByteStreamSplitDecoder, VariableWidthByteStreamSplitDecoder, }; +use crate::encodings::decoding::alp::AlpDecoder; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::{self, BitReader}; +mod alp; mod byte_stream_split_decoder; pub(crate) mod private { @@ -63,13 +65,11 @@ pub(crate) mod private { Encoding::RLE | Encoding::DELTA_BINARY_PACKED | Encoding::DELTA_BYTE_ARRAY - | Encoding::DELTA_LENGTH_BYTE_ARRAY => Err(general_err!( + | Encoding::DELTA_LENGTH_BYTE_ARRAY + | Encoding::ALP => Err(general_err!( "Encoding {} is not supported for type", encoding )), - Encoding::ALP => Err(nyi_err!( - "Encoding ALP is not implemented" - )), e => Err(nyi_err!("Encoding {} is not supported", e)), } } @@ -119,6 +119,7 @@ pub(crate) mod private { ) -> Result>> { match encoding { Encoding::BYTE_STREAM_SPLIT => Ok(Box::new(ByteStreamSplitDecoder::new())), + Encoding::ALP => Ok(Box::new(AlpDecoder::new())), _ => get_decoder_default(descr, encoding), } } @@ -130,6 +131,7 @@ pub(crate) mod private { ) -> Result>> { match encoding { Encoding::BYTE_STREAM_SPLIT => Ok(Box::new(ByteStreamSplitDecoder::new())), + Encoding::ALP => Ok(Box::new(AlpDecoder::new())), _ => get_decoder_default(descr, encoding), } } @@ -1138,6 +1140,8 @@ mod tests { create_and_check_decoder::(Encoding::DELTA_LENGTH_BYTE_ARRAY, None); create_and_check_decoder::(Encoding::DELTA_BYTE_ARRAY, None); create_and_check_decoder::(Encoding::RLE, None); + create_and_check_decoder::(Encoding::ALP, None); + create_and_check_decoder::(Encoding::ALP, None); // error when initializing create_and_check_decoder::( diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs new file mode 100644 index 000000000000..fa436961fcdb --- /dev/null +++ b/parquet/src/encodings/decoding/alp.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::marker::PhantomData; + +use bytes::Bytes; + +use crate::basic::Encoding; +use crate::data_type::DataType; +use crate::encodings::decoding::Decoder; +use crate::errors::{ParquetError, Result}; + +const ALP_HEADER_SIZE: usize = 8; +const ALP_VERSION: u8 = 1; +const ALP_COMPRESSION_MODE: u8 = 0; +const ALP_INTEGER_ENCODING_FOR_BIT_PACK: u8 = 0; +const ALP_MAX_LOG_VECTOR_SIZE: u8 = 16; + +#[derive(Debug, Clone, Copy)] +struct AlpHeader { + version: u8, + compression_mode: u8, + integer_encoding: u8, + log_vector_size: u8, + num_elements: u32, +} + +#[derive(Debug)] +struct AlpPageLayout { + header: AlpHeader, + offsets: Vec, +} + +fn parse_alp_page_layout(data: &[u8]) -> Result { + if data.len() < ALP_HEADER_SIZE { + return Err(general_err!( + "Invalid ALP page: expected at least {} bytes for header, got {}", + ALP_HEADER_SIZE, + data.len() + )); + } + + let header = AlpHeader { + version: data[0], + compression_mode: data[1], + integer_encoding: data[2], + log_vector_size: data[3], + num_elements: u32::from_le_bytes([data[4], data[5], data[6], data[7]]), + }; + + if header.version != ALP_VERSION { + return Err(general_err!( + "Invalid ALP page: unsupported version {}, expected {}", + header.version, + ALP_VERSION + )); + } + + if header.compression_mode != ALP_COMPRESSION_MODE { + return Err(general_err!( + "Invalid ALP page: unsupported compression mode {}", + header.compression_mode + )); + } + + if header.integer_encoding != ALP_INTEGER_ENCODING_FOR_BIT_PACK { + return Err(general_err!( + "Invalid ALP page: unsupported integer encoding {}", + header.integer_encoding + )); + } + + if header.log_vector_size > ALP_MAX_LOG_VECTOR_SIZE { + return Err(general_err!( + "Invalid ALP page: log_vector_size {} exceeds max {}", + header.log_vector_size, + ALP_MAX_LOG_VECTOR_SIZE + )); + } + + let vector_size = 1usize << header.log_vector_size; + let num_vectors = if header.num_elements == 0 { + 0 + } else { + (header.num_elements as usize).div_ceil(vector_size) + }; + + let offsets_len = num_vectors + .checked_mul(std::mem::size_of::()) + .ok_or_else(|| general_err!("Invalid ALP page: offsets length overflow"))?; + let offsets_end = ALP_HEADER_SIZE + .checked_add(offsets_len) + .ok_or_else(|| general_err!("Invalid ALP page: header + offsets length overflow"))?; + + if data.len() < offsets_end { + return Err(general_err!( + "Invalid ALP page: expected at least {} bytes for {} offsets, got {}", + offsets_end, + num_vectors, + data.len() + )); + } + + let body_len = data.len() - ALP_HEADER_SIZE; + let mut offsets = Vec::with_capacity(num_vectors); + for i in 0..num_vectors { + let start = ALP_HEADER_SIZE + i * 4; + let offset = u32::from_le_bytes([ + data[start], + data[start + 1], + data[start + 2], + data[start + 3], + ]); + if offset as usize >= body_len { + return Err(general_err!( + "Invalid ALP page: vector offset {} out of bounds for body length {}", + offset, + body_len + )); + } + offsets.push(offset); + } + + Ok(AlpPageLayout { header, offsets }) +} + +pub(crate) struct AlpDecoder { + num_values: usize, + layout: Option, + _marker: PhantomData, +} + +impl AlpDecoder { + pub(crate) fn new() -> Self { + Self { + num_values: 0, + layout: None, + _marker: PhantomData, + } + } +} + +impl Decoder for AlpDecoder { + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { + let layout = parse_alp_page_layout(data.as_ref())?; + if layout.header.num_elements as usize != num_values { + return Err(general_err!( + "Invalid ALP page: header num_elements {} does not match page num_values {}", + layout.header.num_elements, + num_values + )); + } + + self.num_values = num_values; + self.layout = Some(layout); + Ok(()) + } + + fn get(&mut self, _buffer: &mut [T::T]) -> Result { + // Parsing succeeds in set_data; value decoding is introduced in a follow-up step. + let num_vectors = self + .layout + .as_ref() + .map(|layout| layout.offsets.len()) + .unwrap_or(0); + Err(nyi_err!( + "Encoding ALP page layout parsed ({} vectors), value decoding is not implemented", + num_vectors + )) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::ALP + } + + fn skip(&mut self, num_values: usize) -> Result { + let skipped = num_values.min(self.num_values); + self.num_values -= skipped; + Ok(skipped) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_alp_page_bytes( + version: u8, + compression_mode: u8, + integer_encoding: u8, + log_vector_size: u8, + num_elements: u32, + offsets: &[u32], + body_tail_len: usize, + ) -> Vec { + let mut out = Vec::with_capacity(ALP_HEADER_SIZE + offsets.len() * 4 + body_tail_len); + out.push(version); + out.push(compression_mode); + out.push(integer_encoding); + out.push(log_vector_size); + out.extend_from_slice(&num_elements.to_le_bytes()); + for offset in offsets { + out.extend_from_slice(&offset.to_le_bytes()); + } + out.extend(std::iter::repeat_n(0u8, body_tail_len)); + out + } + + #[test] + fn test_parse_alp_page_layout_valid() { + // num_elements=4 with vector_size=4 -> one vector, one offset entry. + let data = make_alp_page_bytes(1, 0, 0, 2, 4, &[4], 8); + let parsed = parse_alp_page_layout(&data).unwrap(); + assert_eq!(parsed.header.version, 1); + assert_eq!(parsed.header.num_elements, 4); + assert_eq!(parsed.offsets, vec![4]); + } + + #[test] + fn test_parse_alp_page_layout_short_header() { + let err = parse_alp_page_layout(&[0, 1, 2]).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: expected at least 8 bytes for header") + ); + } + + #[test] + fn test_parse_alp_page_layout_invalid_log_vector_size() { + let data = make_alp_page_bytes(1, 0, 0, 17, 1, &[4], 8); + let err = parse_alp_page_layout(&data).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: log_vector_size 17 exceeds max 16") + ); + } + + #[test] + fn test_parse_alp_page_layout_invalid_integer_encoding() { + let data = make_alp_page_bytes(1, 0, 1, 2, 1, &[4], 8); + let err = parse_alp_page_layout(&data).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: unsupported integer encoding 1") + ); + } +} diff --git a/parquet/tests/arrow_reader/alp.rs b/parquet/tests/arrow_reader/alp.rs new file mode 100644 index 000000000000..e92a1ef7797f --- /dev/null +++ b/parquet/tests/arrow_reader/alp.rs @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::util::test_util::parquet_test_data; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; + +#[test] +fn test_read_single_f64_alp_page_layout() { + let path = std::path::PathBuf::from(parquet_test_data()).join("single_f64_ALP.parquet"); + if !path.exists() { + eprintln!("Skipping ALP test file not found: {}", path.display()); + return; + } + + let file = std::fs::File::open(path).unwrap(); + let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); + + let err = loop { + match reader.next() { + Some(Ok(_)) => continue, + Some(Err(err)) => break err, + None => panic!("Expected ALP decode to fail with NYI"), + } + }; + + assert!(err.to_string().contains("Encoding ALP page layout parsed")); +} diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs index ffc36655b39a..24bfae76e081 100644 --- a/parquet/tests/arrow_reader/mod.rs +++ b/parquet/tests/arrow_reader/mod.rs @@ -38,6 +38,7 @@ use parquet::file::properties::{ use std::sync::Arc; use tempfile::NamedTempFile; +mod alp; mod bad_data; mod bloom_filter; #[cfg(feature = "crc")] From 8337fcd18f030b67b111f219838029f51d3b890e Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 15 Feb 2026 11:14:51 -0500 Subject: [PATCH 04/22] =?UTF-8?q?Add=20Interleaved=20Vectors=E2=80=8B=20su?= =?UTF-8?q?pport?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- parquet/src/encodings/decoding/alp.rs | 524 ++++++++++++++++++++++++-- 1 file changed, 498 insertions(+), 26 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index fa436961fcdb..da9adb0ba3df 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -16,6 +16,7 @@ // under the License. use std::marker::PhantomData; +use std::ops::Range; use bytes::Bytes; @@ -30,22 +31,287 @@ const ALP_COMPRESSION_MODE: u8 = 0; const ALP_INTEGER_ENCODING_FOR_BIT_PACK: u8 = 0; const ALP_MAX_LOG_VECTOR_SIZE: u8 = 16; +/// Page-level ALP header (version 1, 8 bytes). +/// +/// Layout in bytes: +/// - `[0]` `version` +/// - `[1]` `compression_mode` +/// - `[2]` `integer_encoding` +/// - `[3]` `log_vector_size` +/// - `[4..8]` `num_elements` (little-endian `i32`) +/// +/// This mirrors the C++ `AlpHeader` in `arrow/util/alp/alp_wrapper.cc`. #[derive(Debug, Clone, Copy)] struct AlpHeader { version: u8, compression_mode: u8, integer_encoding: u8, log_vector_size: u8, - num_elements: u32, + num_elements: i32, } +impl AlpHeader { + fn num_elements_usize(&self) -> usize { + // `num_elements` is serialized as i32, matching Parquet page header `num_values`. + // Parsing rejects negative values, so this conversion is safe. + self.num_elements as usize + } + + /// Returns `2 ^ log_vector_size`. + fn vector_size(&self) -> usize { + 1usize << self.log_vector_size + } + + /// Number of vectors in the page. + /// + /// All vectors are full-sized except possibly the last one. + fn num_vectors(&self) -> usize { + if self.num_elements == 0 { + 0 + } else { + self.num_elements_usize().div_ceil(self.vector_size()) + } + } + + /// Number of logical values in `vector_index`. + fn vector_num_elements(&self, vector_index: usize) -> u16 { + let vector_size = self.vector_size(); + let num_full_vectors = self.num_elements_usize() / vector_size; + let remainder = self.num_elements_usize() % vector_size; + if vector_index < num_full_vectors { + vector_size as u16 + } else if vector_index == num_full_vectors && remainder > 0 { + remainder as u16 + } else { + 0 + } + } +} + +/// Per-vector ALP metadata (4 bytes), equivalent to C++ `AlpEncodedVectorInfo`. +/// +/// Layout in bytes: +/// - `[0]` `exponent` +/// - `[1]` `factor` +/// - `[2..4]` `num_exceptions` (little-endian `u16`) +#[derive(Debug, Clone, Copy)] +struct AlpEncodedVectorInfo { + exponent: u8, + factor: u8, + num_exceptions: u16, +} + +impl AlpEncodedVectorInfo { + const STORED_SIZE: usize = 4; +} + +/// Per-vector FOR metadata for a given exact integer type (`u32` for `f32`, +/// `u64` for `f64`), equivalent to C++ `AlpEncodedForVectorInfo`. +#[derive(Debug, Clone, Copy)] +struct AlpEncodedForVectorInfo { + frame_of_reference: Exact, + bit_width: u8, +} + +impl AlpEncodedForVectorInfo { + /// Serialized size of FOR metadata (`Exact::WIDTH + 1` for `bit_width`). + fn stored_size() -> usize { + Exact::WIDTH + 1 + } + + /// Number of bytes used for bit-packed encoded integers. + fn get_bit_packed_size(&self, num_elements: u16) -> usize { + (self.bit_width as usize * num_elements as usize).div_ceil(8) + } + + /// Data-only size: + /// `[packed_values][exception_positions][exception_values]` + fn get_data_stored_size(&self, num_elements: u16, num_exceptions: u16) -> usize { + let bit_packed_size = self.get_bit_packed_size(num_elements); + bit_packed_size + + num_exceptions as usize * std::mem::size_of::() + + num_exceptions as usize * Exact::WIDTH + } +} + +/// Parsed view of one vector's metadata and data slices. +/// +/// Note: +/// - `packed_values` is expressed as a range into the vector data section +/// - exceptions are copied into owned vectors for safety and simplicity #[derive(Debug)] -struct AlpPageLayout { +struct AlpEncodedVectorView { + num_elements: u16, + alp_info: AlpEncodedVectorInfo, + for_info: AlpEncodedForVectorInfo, + packed_values: Range, + exception_positions: Vec, + exception_values: Vec, +} + +/// Parsed ALP layout for a page with a concrete exact integer type. +/// +/// `Exact = u32` for `f32` pages and `Exact = u64` for `f64` pages. +#[derive(Debug)] +struct AlpPageLayout { header: AlpHeader, offsets: Vec, + vectors: Vec>, +} + +/// Type-erased page layout used by `AlpDecoder` so we can keep one decoder +/// implementation while preserving exact-width FOR storage. +#[derive(Debug)] +enum AlpPageLayoutAny { + F32(AlpPageLayout), + F64(AlpPageLayout), +} + +impl AlpPageLayoutAny { + fn num_vectors(&self) -> usize { + match self { + Self::F32(layout) => layout.vectors.len(), + Self::F64(layout) => layout.vectors.len(), + } + } + + fn num_offsets(&self) -> usize { + match self { + Self::F32(layout) => layout.offsets.len(), + Self::F64(layout) => layout.offsets.len(), + } + } + + fn parsed_values(&self) -> usize { + match self { + Self::F32(layout) => layout.vectors.iter().map(|v| v.num_elements as usize).sum(), + Self::F64(layout) => layout.vectors.iter().map(|v| v.num_elements as usize).sum(), + } + } + + fn total_exceptions(&self) -> usize { + match self { + Self::F32(layout) => layout + .vectors + .iter() + .map(|v| v.alp_info.num_exceptions as usize) + .sum(), + Self::F64(layout) => layout + .vectors + .iter() + .map(|v| v.alp_info.num_exceptions as usize) + .sum(), + } + } + + fn total_packed_bytes(&self) -> usize { + match self { + Self::F32(layout) => layout + .vectors + .iter() + .map(|v| v.packed_values.end - v.packed_values.start) + .sum(), + Self::F64(layout) => layout + .vectors + .iter() + .map(|v| v.packed_values.end - v.packed_values.start) + .sum(), + } + } + + fn total_exception_bytes(&self) -> usize { + match self { + Self::F32(layout) => layout.vectors.iter().map(|v| v.exception_values.len()).sum(), + Self::F64(layout) => layout.vectors.iter().map(|v| v.exception_values.len()).sum(), + } + } + + fn sum_for_xor(&self) -> u64 { + match self { + Self::F32(layout) => layout + .vectors + .iter() + .fold(0u64, |acc, v| acc ^ v.for_info.frame_of_reference.to_u64()), + Self::F64(layout) => layout + .vectors + .iter() + .fold(0u64, |acc, v| acc ^ v.for_info.frame_of_reference.to_u64()), + } + } + + fn sum_positions(&self) -> usize { + match self { + Self::F32(layout) => layout + .vectors + .iter() + .flat_map(|v| v.exception_positions.iter()) + .map(|v| *v as usize) + .sum(), + Self::F64(layout) => layout + .vectors + .iter() + .flat_map(|v| v.exception_positions.iter()) + .map(|v| *v as usize) + .sum(), + } + } +} + +/// Exact integer type used by FOR reconstruction. +/// +/// This mirrors C++: +/// - `float` -> `uint32_t` +/// - `double` -> `uint64_t` +/// +/// Why unsigned (not `i32`/`i64`)? +/// - FOR computes and stores deltas (`value - frame_of_reference`) that are +/// encoded with bitpacking and are naturally treated as non-negative bit +/// patterns. +/// - Using unsigned exact-width integers avoids signed-overflow edge cases in +/// the FOR stage and matches the C++ implementation's arithmetic model. +/// - Signed interpretation is still applied later during ALP decimal +/// reconstruction (after inverse FOR), so we preserve behavior while keeping +/// FOR metadata compact and byte-compatible. +trait AlpExact: Copy + std::fmt::Debug { + const WIDTH: usize; + fn from_le_slice(slice: &[u8]) -> Self; + fn to_u64(self) -> u64; +} + +impl AlpExact for u32 { + const WIDTH: usize = 4; + + fn from_le_slice(slice: &[u8]) -> Self { + u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]) + } + + fn to_u64(self) -> u64 { + self as u64 + } +} + +impl AlpExact for u64 { + const WIDTH: usize = 8; + + fn from_le_slice(slice: &[u8]) -> Self { + u64::from_le_bytes([ + slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7], + ]) + } + + fn to_u64(self) -> u64 { + self + } } -fn parse_alp_page_layout(data: &[u8]) -> Result { +/// Parse and validate a full ALP page body. +/// +/// The caller provides `Exact` (`u32` or `u64`) based on the physical type. +/// Parsing validates: +/// - header fields and version +/// - offsets section bounds and monotonicity +/// - per-vector metadata and data section sizes +fn parse_alp_page_layout(data: &[u8]) -> Result> { if data.len() < ALP_HEADER_SIZE { return Err(general_err!( "Invalid ALP page: expected at least {} bytes for header, got {}", @@ -59,7 +325,7 @@ fn parse_alp_page_layout(data: &[u8]) -> Result { compression_mode: data[1], integer_encoding: data[2], log_vector_size: data[3], - num_elements: u32::from_le_bytes([data[4], data[5], data[6], data[7]]), + num_elements: i32::from_le_bytes([data[4], data[5], data[6], data[7]]), }; if header.version != ALP_VERSION { @@ -91,13 +357,14 @@ fn parse_alp_page_layout(data: &[u8]) -> Result { ALP_MAX_LOG_VECTOR_SIZE )); } + if header.num_elements < 0 { + return Err(general_err!( + "Invalid ALP page: num_elements {} must be >= 0", + header.num_elements + )); + } - let vector_size = 1usize << header.log_vector_size; - let num_vectors = if header.num_elements == 0 { - 0 - } else { - (header.num_elements as usize).div_ceil(vector_size) - }; + let num_vectors = header.num_vectors(); let offsets_len = num_vectors .checked_mul(std::mem::size_of::()) @@ -116,6 +383,7 @@ fn parse_alp_page_layout(data: &[u8]) -> Result { } let body_len = data.len() - ALP_HEADER_SIZE; + let offsets_section_size = num_vectors * std::mem::size_of::(); let mut offsets = Vec::with_capacity(num_vectors); for i in 0..num_vectors { let start = ALP_HEADER_SIZE + i * 4; @@ -132,15 +400,130 @@ fn parse_alp_page_layout(data: &[u8]) -> Result { body_len )); } + if (offset as usize) < offsets_section_size { + return Err(general_err!( + "Invalid ALP page: vector offset {} points into offsets section {}", + offset, + offsets_section_size + )); + } offsets.push(offset); } - Ok(AlpPageLayout { header, offsets }) + let body = &data[ALP_HEADER_SIZE..]; + let mut vectors = Vec::with_capacity(num_vectors); + for (vector_idx, vector_offset) in offsets.iter().enumerate() { + let vector_start = *vector_offset as usize; + let vector_end = if vector_idx + 1 < offsets.len() { + offsets[vector_idx + 1] as usize + } else { + body_len + }; + + if vector_end < vector_start { + return Err(general_err!( + "Invalid ALP page: vector offsets are not monotonic at index {}", + vector_idx + )); + } + + let vector_num_elements = header.vector_num_elements(vector_idx); + let vector_view = parse_vector_view( + &body[vector_start..vector_end], + vector_num_elements, + )?; + vectors.push(vector_view); + } + + Ok(AlpPageLayout { + header, + offsets, + vectors, + }) +} + +/// Parse one vector: +/// `[AlpEncodedVectorInfo][AlpEncodedForVectorInfo][data-only section]` +/// +/// Data-only section layout: +/// `[packed_values][exception_positions][exception_values]` +fn parse_vector_view( + vector_bytes: &[u8], + num_elements: u16, +) -> Result> { + let metadata_size = AlpEncodedVectorInfo::STORED_SIZE + + AlpEncodedForVectorInfo::::stored_size(); + if vector_bytes.len() < metadata_size { + return Err(general_err!( + "Invalid ALP page: vector metadata too short, expected at least {} bytes, got {}", + metadata_size, + vector_bytes.len() + )); + } + + let alp_info = AlpEncodedVectorInfo { + exponent: vector_bytes[0], + factor: vector_bytes[1], + num_exceptions: u16::from_le_bytes([vector_bytes[2], vector_bytes[3]]), + }; + + let for_start = AlpEncodedVectorInfo::STORED_SIZE; + let for_end = for_start + Exact::WIDTH; + let frame_of_reference = Exact::from_le_slice(&vector_bytes[for_start..for_end]); + let bit_width = vector_bytes[for_end]; + if bit_width as usize > Exact::WIDTH * 8 { + return Err(general_err!( + "Invalid ALP page: bit width {} exceeds {}", + bit_width, + Exact::WIDTH * 8 + )); + } + + let for_info = AlpEncodedForVectorInfo:: { + frame_of_reference, + bit_width, + }; + + let data_size = for_info.get_data_stored_size(num_elements, alp_info.num_exceptions); + if vector_bytes.len() < metadata_size + data_size { + return Err(general_err!( + "Invalid ALP page: vector data too short, expected at least {} bytes, got {}", + metadata_size + data_size, + vector_bytes.len() + )); + } + + let data = &vector_bytes[metadata_size..metadata_size + data_size]; + let packed_size = for_info.get_bit_packed_size(num_elements); + let positions_size = alp_info.num_exceptions as usize * std::mem::size_of::(); + let values_size = alp_info.num_exceptions as usize * Exact::WIDTH; + + let packed_start = 0; + let packed_end = packed_start + packed_size; + let positions_start = packed_end; + let positions_end = positions_start + positions_size; + let values_start = positions_end; + let values_end = values_start + values_size; + + let mut exception_positions = Vec::with_capacity(alp_info.num_exceptions as usize); + for chunk in data[positions_start..positions_end].chunks_exact(2) { + exception_positions.push(u16::from_le_bytes([chunk[0], chunk[1]])); + } + let exception_values = data[values_start..values_end].to_vec(); + + Ok(AlpEncodedVectorView { + num_elements, + alp_info, + for_info, + packed_values: packed_start..packed_end, + exception_positions, + exception_values, + }) } pub(crate) struct AlpDecoder { num_values: usize, - layout: Option, + layout: Option, _marker: PhantomData, } @@ -156,11 +539,25 @@ impl AlpDecoder { impl Decoder for AlpDecoder { fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { - let layout = parse_alp_page_layout(data.as_ref())?; - if layout.header.num_elements as usize != num_values { + // Keep exact-width FOR values per page type, matching C++ layout choices. + let layout = match std::mem::size_of::() { + 4 => AlpPageLayoutAny::F32(parse_alp_page_layout::(data.as_ref())?), + 8 => AlpPageLayoutAny::F64(parse_alp_page_layout::(data.as_ref())?), + type_size => { + return Err(general_err!( + "Invalid ALP page: exact type size {} is unsupported", + type_size + )) + } + }; + let header_num_elements = match &layout { + AlpPageLayoutAny::F32(layout) => layout.header.num_elements, + AlpPageLayoutAny::F64(layout) => layout.header.num_elements, + }; + if header_num_elements as usize != num_values { return Err(general_err!( "Invalid ALP page: header num_elements {} does not match page num_values {}", - layout.header.num_elements, + header_num_elements, num_values )); } @@ -171,15 +568,45 @@ impl Decoder for AlpDecoder { } fn get(&mut self, _buffer: &mut [T::T]) -> Result { - // Parsing succeeds in set_data; value decoding is introduced in a follow-up step. - let num_vectors = self + // Layout parsing succeeds in `set_data`; value decode is implemented next. + let num_vectors = self.layout.as_ref().map(|layout| layout.num_vectors()).unwrap_or(0); + let num_offsets = self.layout.as_ref().map(|layout| layout.num_offsets()).unwrap_or(0); + let parsed_values = self + .layout + .as_ref() + .map(|layout| layout.parsed_values()) + .unwrap_or(0); + let total_exceptions = self + .layout + .as_ref() + .map(|layout| layout.total_exceptions()) + .unwrap_or(0); + let total_packed_bytes = self + .layout + .as_ref() + .map(|layout| layout.total_packed_bytes()) + .unwrap_or(0); + let total_exception_bytes = self + .layout + .as_ref() + .map(|layout| layout.total_exception_bytes()) + .unwrap_or(0); + let sum_for = self.layout.as_ref().map(|layout| layout.sum_for_xor()).unwrap_or(0); + let sum_positions = self .layout .as_ref() - .map(|layout| layout.offsets.len()) + .map(|layout| layout.sum_positions()) .unwrap_or(0); Err(nyi_err!( - "Encoding ALP page layout parsed ({} vectors), value decoding is not implemented", - num_vectors + "Encoding ALP page layout parsed ({} vectors, {} offsets, {} values, {} exceptions, {} packed bytes, {} exception bytes, for-xor {}, pos-sum {}), value decoding is not implemented", + num_vectors, + num_offsets, + parsed_values, + total_exceptions, + total_packed_bytes, + total_exception_bytes, + sum_for, + sum_positions )) } @@ -207,7 +634,7 @@ mod tests { compression_mode: u8, integer_encoding: u8, log_vector_size: u8, - num_elements: u32, + num_elements: i32, offsets: &[u32], body_tail_len: usize, ) -> Vec { @@ -227,8 +654,8 @@ mod tests { #[test] fn test_parse_alp_page_layout_valid() { // num_elements=4 with vector_size=4 -> one vector, one offset entry. - let data = make_alp_page_bytes(1, 0, 0, 2, 4, &[4], 8); - let parsed = parse_alp_page_layout(&data).unwrap(); + let data = make_alp_page_bytes(1, 0, 0, 2, 4, &[4], 13); + let parsed = parse_alp_page_layout::(&data).unwrap(); assert_eq!(parsed.header.version, 1); assert_eq!(parsed.header.num_elements, 4); assert_eq!(parsed.offsets, vec![4]); @@ -236,7 +663,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_short_header() { - let err = parse_alp_page_layout(&[0, 1, 2]).unwrap_err(); + let err = parse_alp_page_layout::(&[0, 1, 2]).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: expected at least 8 bytes for header") @@ -246,7 +673,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_log_vector_size() { let data = make_alp_page_bytes(1, 0, 0, 17, 1, &[4], 8); - let err = parse_alp_page_layout(&data).unwrap_err(); + let err = parse_alp_page_layout::(&data).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: log_vector_size 17 exceeds max 16") @@ -256,10 +683,55 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_integer_encoding() { let data = make_alp_page_bytes(1, 0, 1, 2, 1, &[4], 8); - let err = parse_alp_page_layout(&data).unwrap_err(); + let err = parse_alp_page_layout::(&data).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: unsupported integer encoding 1") ); } + + #[test] + fn test_parse_alp_page_layout_negative_num_elements() { + let data = make_alp_page_bytes(1, 0, 0, 2, -1, &[4], 8); + let err = parse_alp_page_layout::(&data).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: num_elements -1 must be >= 0") + ); + } + + #[test] + fn test_parse_alp_page_layout_parses_vector_view_data_only_f64() { + let mut vector = Vec::new(); + + // AlpEncodedVectorInfo: exponent=2, factor=0, num_exceptions=1 + vector.push(2); + vector.push(0); + vector.extend_from_slice(&1u16.to_le_bytes()); + + // AlpEncodedForVectorInfo: frame_of_reference=10, bit_width=0 + vector.extend_from_slice(&10u64.to_le_bytes()); + vector.push(0); + + // Packed values: bit_width=0 and num_elements=1 -> 0 bytes. + // Exception positions (1 * u16) + vector.extend_from_slice(&0u16.to_le_bytes()); + // Exception values (1 * f64 bytes) + vector.extend_from_slice(&42.5_f64.to_le_bytes()); + + let offsets = [4u32]; + let mut page = make_alp_page_bytes(1, 0, 0, 0, 1, &offsets, 0); + page.extend_from_slice(&vector); + + let parsed = parse_alp_page_layout::(&page).unwrap(); + assert_eq!(parsed.vectors.len(), 1); + assert_eq!(parsed.vectors[0].num_elements, 1); + assert_eq!(parsed.vectors[0].alp_info.num_exceptions, 1); + assert_eq!(parsed.vectors[0].for_info.bit_width, 0); + assert_eq!(parsed.vectors[0].exception_positions, vec![0]); + assert_eq!( + parsed.vectors[0].exception_values, + 42.5_f64.to_le_bytes().to_vec() + ); + } } From 148b1d51fd0d295e690bbe939e36becacab2a2bd Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 18 Feb 2026 17:19:55 -0500 Subject: [PATCH 05/22] Add bit unpacking, reverse FOR and exceptions patching --- parquet/src/encodings/decoding/alp.rs | 361 +++++++++++++++++++------- 1 file changed, 264 insertions(+), 97 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index da9adb0ba3df..e234208a6daf 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -24,6 +24,7 @@ use crate::basic::Encoding; use crate::data_type::DataType; use crate::encodings::decoding::Decoder; use crate::errors::{ParquetError, Result}; +use crate::util::bit_util::{BitReader, FromBytes}; const ALP_HEADER_SIZE: usize = 8; const ALP_VERSION: u8 = 1; @@ -39,8 +40,6 @@ const ALP_MAX_LOG_VECTOR_SIZE: u8 = 16; /// - `[2]` `integer_encoding` /// - `[3]` `log_vector_size` /// - `[4..8]` `num_elements` (little-endian `i32`) -/// -/// This mirrors the C++ `AlpHeader` in `arrow/util/alp/alp_wrapper.cc`. #[derive(Debug, Clone, Copy)] struct AlpHeader { version: u8, @@ -52,19 +51,13 @@ struct AlpHeader { impl AlpHeader { fn num_elements_usize(&self) -> usize { - // `num_elements` is serialized as i32, matching Parquet page header `num_values`. - // Parsing rejects negative values, so this conversion is safe. self.num_elements as usize } - /// Returns `2 ^ log_vector_size`. fn vector_size(&self) -> usize { 1usize << self.log_vector_size } - /// Number of vectors in the page. - /// - /// All vectors are full-sized except possibly the last one. fn num_vectors(&self) -> usize { if self.num_elements == 0 { 0 @@ -73,7 +66,6 @@ impl AlpHeader { } } - /// Number of logical values in `vector_index`. fn vector_num_elements(&self, vector_index: usize) -> u16 { let vector_size = self.vector_size(); let num_full_vectors = self.num_elements_usize() / vector_size; @@ -89,11 +81,6 @@ impl AlpHeader { } /// Per-vector ALP metadata (4 bytes), equivalent to C++ `AlpEncodedVectorInfo`. -/// -/// Layout in bytes: -/// - `[0]` `exponent` -/// - `[1]` `factor` -/// - `[2..4]` `num_exceptions` (little-endian `u16`) #[derive(Debug, Clone, Copy)] struct AlpEncodedVectorInfo { exponent: u8, @@ -105,8 +92,7 @@ impl AlpEncodedVectorInfo { const STORED_SIZE: usize = 4; } -/// Per-vector FOR metadata for a given exact integer type (`u32` for `f32`, -/// `u64` for `f64`), equivalent to C++ `AlpEncodedForVectorInfo`. +/// Per-vector FOR metadata for exact integer type (`u32` for `f32`, `u64` for `f64`). #[derive(Debug, Clone, Copy)] struct AlpEncodedForVectorInfo { frame_of_reference: Exact, @@ -114,18 +100,14 @@ struct AlpEncodedForVectorInfo { } impl AlpEncodedForVectorInfo { - /// Serialized size of FOR metadata (`Exact::WIDTH + 1` for `bit_width`). fn stored_size() -> usize { Exact::WIDTH + 1 } - /// Number of bytes used for bit-packed encoded integers. fn get_bit_packed_size(&self, num_elements: u16) -> usize { (self.bit_width as usize * num_elements as usize).div_ceil(8) } - /// Data-only size: - /// `[packed_values][exception_positions][exception_values]` fn get_data_stored_size(&self, num_elements: u16, num_exceptions: u16) -> usize { let bit_packed_size = self.get_bit_packed_size(num_elements); bit_packed_size @@ -136,9 +118,12 @@ impl AlpEncodedForVectorInfo { /// Parsed view of one vector's metadata and data slices. /// -/// Note: -/// - `packed_values` is expressed as a range into the vector data section -/// - exceptions are copied into owned vectors for safety and simplicity +/// `packed_values` is a zero-copy range into page body bytes. +/// Exception positions/values are copied for straightforward decode handling. +/// Parsed view of one vector. +/// +/// `packed_values` is a byte range into [`AlpPageLayout::body`] (zero-copy), +/// matching the C++ `LoadViewDataOnly` model for packed bytes. #[derive(Debug)] struct AlpEncodedVectorView { num_elements: u16, @@ -146,21 +131,20 @@ struct AlpEncodedVectorView { for_info: AlpEncodedForVectorInfo, packed_values: Range, exception_positions: Vec, - exception_values: Vec, + exception_values: Vec, } -/// Parsed ALP layout for a page with a concrete exact integer type. -/// -/// `Exact = u32` for `f32` pages and `Exact = u64` for `f64` pages. +/// Parsed ALP page layout for one exact integer width (`u32` for float pages, +/// `u64` for double pages). #[derive(Debug)] struct AlpPageLayout { header: AlpHeader, + body: Bytes, offsets: Vec, vectors: Vec>, } -/// Type-erased page layout used by `AlpDecoder` so we can keep one decoder -/// implementation while preserving exact-width FOR storage. +/// Type-erased wrapper over parsed `f32`/`f64` page layouts. #[derive(Debug)] enum AlpPageLayoutAny { F32(AlpPageLayout), @@ -221,8 +205,16 @@ impl AlpPageLayoutAny { fn total_exception_bytes(&self) -> usize { match self { - Self::F32(layout) => layout.vectors.iter().map(|v| v.exception_values.len()).sum(), - Self::F64(layout) => layout.vectors.iter().map(|v| v.exception_values.len()).sum(), + Self::F32(layout) => layout + .vectors + .iter() + .map(|v| v.exception_values.len() * std::mem::size_of::()) + .sum(), + Self::F64(layout) => layout + .vectors + .iter() + .map(|v| v.exception_values.len() * std::mem::size_of::()) + .sum(), } } @@ -264,18 +256,15 @@ impl AlpPageLayoutAny { /// - `double` -> `uint64_t` /// /// Why unsigned (not `i32`/`i64`)? -/// - FOR computes and stores deltas (`value - frame_of_reference`) that are -/// encoded with bitpacking and are naturally treated as non-negative bit -/// patterns. -/// - Using unsigned exact-width integers avoids signed-overflow edge cases in -/// the FOR stage and matches the C++ implementation's arithmetic model. -/// - Signed interpretation is still applied later during ALP decimal -/// reconstruction (after inverse FOR), so we preserve behavior while keeping -/// FOR metadata compact and byte-compatible. +/// - FOR stores non-negative deltas optimized for bitpacking. +/// - Unsigned arithmetic avoids signed-overflow edge cases in FOR stage. +/// - Signed interpretation is applied later during decimal reconstruction. trait AlpExact: Copy + std::fmt::Debug { const WIDTH: usize; fn from_le_slice(slice: &[u8]) -> Self; fn to_u64(self) -> u64; + fn zero() -> Self; + fn wrapping_add(self, rhs: Self) -> Self; } impl AlpExact for u32 { @@ -288,6 +277,14 @@ impl AlpExact for u32 { fn to_u64(self) -> u64 { self as u64 } + + fn zero() -> Self { + 0 + } + + fn wrapping_add(self, rhs: Self) -> Self { + self.wrapping_add(rhs) + } } impl AlpExact for u64 { @@ -302,30 +299,39 @@ impl AlpExact for u64 { fn to_u64(self) -> u64 { self } + + fn zero() -> Self { + 0 + } + + fn wrapping_add(self, rhs: Self) -> Self { + self.wrapping_add(rhs) + } } -/// Parse and validate a full ALP page body. +/// Parse and validate a full ALP-encoded page body. /// -/// The caller provides `Exact` (`u32` or `u64`) based on the physical type. -/// Parsing validates: -/// - header fields and version -/// - offsets section bounds and monotonicity -/// - per-vector metadata and data section sizes -fn parse_alp_page_layout(data: &[u8]) -> Result> { - if data.len() < ALP_HEADER_SIZE { +/// Validation includes: +/// - header fields/version/encoding +/// - non-negative `num_elements` +/// - offsets bounds + monotonicity +/// - per-vector metadata/data section lengths +fn parse_alp_page_layout(data: Bytes) -> Result> { + let data_ref = data.as_ref(); + if data_ref.len() < ALP_HEADER_SIZE { return Err(general_err!( "Invalid ALP page: expected at least {} bytes for header, got {}", ALP_HEADER_SIZE, - data.len() + data_ref.len() )); } let header = AlpHeader { - version: data[0], - compression_mode: data[1], - integer_encoding: data[2], - log_vector_size: data[3], - num_elements: i32::from_le_bytes([data[4], data[5], data[6], data[7]]), + version: data_ref[0], + compression_mode: data_ref[1], + integer_encoding: data_ref[2], + log_vector_size: data_ref[3], + num_elements: i32::from_le_bytes([data_ref[4], data_ref[5], data_ref[6], data_ref[7]]), }; if header.version != ALP_VERSION { @@ -357,6 +363,7 @@ fn parse_alp_page_layout(data: &[u8]) -> Result= 0", @@ -373,26 +380,30 @@ fn parse_alp_page_layout(data: &[u8]) -> Result(); + let mut offsets = Vec::with_capacity(num_vectors); for i in 0..num_vectors { let start = ALP_HEADER_SIZE + i * 4; let offset = u32::from_le_bytes([ - data[start], - data[start + 1], - data[start + 2], - data[start + 3], + data_ref[start], + data_ref[start + 1], + data_ref[start + 2], + data_ref[start + 3], ]); + if offset as usize >= body_len { return Err(general_err!( "Invalid ALP page: vector offset {} out of bounds for body length {}", @@ -400,6 +411,7 @@ fn parse_alp_page_layout(data: &[u8]) -> Result(data: &[u8]) -> Result(data: &[u8]) -> Result( + body_ref, + vector_start, + vector_end, vector_num_elements, - )?; - vectors.push(vector_view); + )?); } Ok(AlpPageLayout { header, + body, offsets, vectors, }) } -/// Parse one vector: -/// `[AlpEncodedVectorInfo][AlpEncodedForVectorInfo][data-only section]` -/// -/// Data-only section layout: -/// `[packed_values][exception_positions][exception_values]` +/// Parse a single vector section: +/// `[AlpInfo][ForInfo][PackedValues][ExceptionPositions][ExceptionValues]`. fn parse_vector_view( - vector_bytes: &[u8], + body: &[u8], + vector_start: usize, + vector_end: usize, num_elements: u16, ) -> Result> { - let metadata_size = AlpEncodedVectorInfo::STORED_SIZE - + AlpEncodedForVectorInfo::::stored_size(); + let vector_bytes = &body[vector_start..vector_end]; + + let metadata_size = + AlpEncodedVectorInfo::STORED_SIZE + AlpEncodedForVectorInfo::::stored_size(); if vector_bytes.len() < metadata_size { return Err(general_err!( "Invalid ALP page: vector metadata too short, expected at least {} bytes, got {}", @@ -471,6 +486,7 @@ fn parse_vector_view( let for_end = for_start + Exact::WIDTH; let frame_of_reference = Exact::from_le_slice(&vector_bytes[for_start..for_end]); let bit_width = vector_bytes[for_end]; + if bit_width as usize > Exact::WIDTH * 8 { return Err(general_err!( "Invalid ALP page: bit width {} exceeds {}", @@ -509,18 +525,122 @@ fn parse_vector_view( for chunk in data[positions_start..positions_end].chunks_exact(2) { exception_positions.push(u16::from_le_bytes([chunk[0], chunk[1]])); } - let exception_values = data[values_start..values_end].to_vec(); + + let packed_values = + (vector_start + metadata_size + packed_start)..(vector_start + metadata_size + packed_end); + + let mut exception_values = Vec::with_capacity(alp_info.num_exceptions as usize); + for chunk in data[values_start..values_end].chunks_exact(Exact::WIDTH) { + exception_values.push(Exact::from_le_slice(chunk)); + } Ok(AlpEncodedVectorView { num_elements, alp_info, for_info, - packed_values: packed_start..packed_end, + packed_values, exception_positions, exception_values, }) } +/// Decode bit-packed deltas into exact integers. +fn bit_unpack_integers( + packed_values: &[u8], + bit_width: u8, + num_elements: u16, +) -> Result> { + if bit_width as usize > Exact::WIDTH * 8 { + return Err(general_err!( + "Invalid ALP page: bit width {} exceeds {}", + bit_width, + Exact::WIDTH * 8 + )); + } + + if bit_width == 0 { + return Ok(vec![Exact::zero(); num_elements as usize]); + } + + let mut out = vec![Exact::zero(); num_elements as usize]; + let mut reader = BitReader::new(Bytes::copy_from_slice(packed_values)); + let read = reader.get_batch::(&mut out, bit_width as usize); + if read != out.len() { + return Err(general_err!( + "Invalid ALP page: bit unpack read {} values, expected {}", + read, + out.len() + )); + } + + Ok(out) +} + +/// Apply inverse FOR: `decoded = delta + frame_of_reference`. +fn inverse_for(deltas: &mut [Exact], frame_of_reference: Exact) { + for value in deltas { + *value = value.wrapping_add(frame_of_reference); + } +} + +/// Patch exception values at their original positions. +fn patch_exceptions( + decoded: &mut [Exact], + exception_positions: &[u16], + exception_values: &[Exact], +) -> Result<()> { + if exception_positions.len() != exception_values.len() { + return Err(general_err!( + "Invalid ALP page: exception positions ({}) and values ({}) length mismatch", + exception_positions.len(), + exception_values.len() + )); + } + + for (pos, value) in exception_positions.iter().zip(exception_values.iter()) { + let pos = *pos as usize; + if pos >= decoded.len() { + return Err(general_err!( + "Invalid ALP page: exception position {} out of bounds for vector length {}", + pos, + decoded.len() + )); + } + decoded[pos] = *value; + } + + Ok(()) +} + +/// Decode one vector to exact integers: +/// bit-unpack -> inverse FOR -> exception patch. +fn decode_vector_exact( + body: &[u8], + vector: &AlpEncodedVectorView, +) -> Result> { + let mut decoded = bit_unpack_integers( + &body[vector.packed_values.clone()], + vector.for_info.bit_width, + vector.num_elements, + )?; + inverse_for(&mut decoded, vector.for_info.frame_of_reference); + patch_exceptions( + &mut decoded, + &vector.exception_positions, + &vector.exception_values, + )?; + Ok(decoded) +} + +/// Decode all vectors in a page into exact integers (still pre-decimal reconstruction). +fn decode_page_exact(layout: &AlpPageLayout) -> Result> { + let mut out = Vec::with_capacity(layout.header.num_elements_usize()); + for vector in &layout.vectors { + out.extend_from_slice(&decode_vector_exact(layout.body.as_ref(), vector)?); + } + Ok(out) +} + pub(crate) struct AlpDecoder { num_values: usize, layout: Option, @@ -539,10 +659,9 @@ impl AlpDecoder { impl Decoder for AlpDecoder { fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { - // Keep exact-width FOR values per page type, matching C++ layout choices. let layout = match std::mem::size_of::() { - 4 => AlpPageLayoutAny::F32(parse_alp_page_layout::(data.as_ref())?), - 8 => AlpPageLayoutAny::F64(parse_alp_page_layout::(data.as_ref())?), + 4 => AlpPageLayoutAny::F32(parse_alp_page_layout::(data)?), + 8 => AlpPageLayoutAny::F64(parse_alp_page_layout::(data)?), type_size => { return Err(general_err!( "Invalid ALP page: exact type size {} is unsupported", @@ -550,10 +669,12 @@ impl Decoder for AlpDecoder { )) } }; + let header_num_elements = match &layout { AlpPageLayoutAny::F32(layout) => layout.header.num_elements, AlpPageLayoutAny::F64(layout) => layout.header.num_elements, }; + if header_num_elements as usize != num_values { return Err(general_err!( "Invalid ALP page: header num_elements {} does not match page num_values {}", @@ -568,7 +689,6 @@ impl Decoder for AlpDecoder { } fn get(&mut self, _buffer: &mut [T::T]) -> Result { - // Layout parsing succeeds in `set_data`; value decode is implemented next. let num_vectors = self.layout.as_ref().map(|layout| layout.num_vectors()).unwrap_or(0); let num_offsets = self.layout.as_ref().map(|layout| layout.num_offsets()).unwrap_or(0); let parsed_values = self @@ -597,8 +717,19 @@ impl Decoder for AlpDecoder { .as_ref() .map(|layout| layout.sum_positions()) .unwrap_or(0); + + let decoded_checksum = match self.layout.as_ref() { + Some(AlpPageLayoutAny::F32(layout)) => decode_page_exact(layout)? + .into_iter() + .fold(0u64, |acc, v| acc ^ v.to_u64()), + Some(AlpPageLayoutAny::F64(layout)) => decode_page_exact(layout)? + .into_iter() + .fold(0u64, |acc, v| acc ^ v.to_u64()), + None => 0, + }; + Err(nyi_err!( - "Encoding ALP page layout parsed ({} vectors, {} offsets, {} values, {} exceptions, {} packed bytes, {} exception bytes, for-xor {}, pos-sum {}), value decoding is not implemented", + "Encoding ALP page layout parsed ({} vectors, {} offsets, {} values, {} exceptions, {} packed bytes, {} exception bytes, for-xor {}, pos-sum {}, decoded-xor {}), value decoding is not implemented", num_vectors, num_offsets, parsed_values, @@ -606,7 +737,8 @@ impl Decoder for AlpDecoder { total_packed_bytes, total_exception_bytes, sum_for, - sum_positions + sum_positions, + decoded_checksum )) } @@ -653,9 +785,8 @@ mod tests { #[test] fn test_parse_alp_page_layout_valid() { - // num_elements=4 with vector_size=4 -> one vector, one offset entry. let data = make_alp_page_bytes(1, 0, 0, 2, 4, &[4], 13); - let parsed = parse_alp_page_layout::(&data).unwrap(); + let parsed = parse_alp_page_layout::(Bytes::from(data)).unwrap(); assert_eq!(parsed.header.version, 1); assert_eq!(parsed.header.num_elements, 4); assert_eq!(parsed.offsets, vec![4]); @@ -663,7 +794,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_short_header() { - let err = parse_alp_page_layout::(&[0, 1, 2]).unwrap_err(); + let err = parse_alp_page_layout::(Bytes::from_static(&[0, 1, 2])).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: expected at least 8 bytes for header") @@ -673,7 +804,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_log_vector_size() { let data = make_alp_page_bytes(1, 0, 0, 17, 1, &[4], 8); - let err = parse_alp_page_layout::(&data).unwrap_err(); + let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: log_vector_size 17 exceeds max 16") @@ -683,7 +814,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_integer_encoding() { let data = make_alp_page_bytes(1, 0, 1, 2, 1, &[4], 8); - let err = parse_alp_page_layout::(&data).unwrap_err(); + let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: unsupported integer encoding 1") @@ -693,7 +824,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_negative_num_elements() { let data = make_alp_page_bytes(1, 0, 0, 2, -1, &[4], 8); - let err = parse_alp_page_layout::(&data).unwrap_err(); + let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() .contains("Invalid ALP page: num_elements -1 must be >= 0") @@ -704,34 +835,70 @@ mod tests { fn test_parse_alp_page_layout_parses_vector_view_data_only_f64() { let mut vector = Vec::new(); - // AlpEncodedVectorInfo: exponent=2, factor=0, num_exceptions=1 vector.push(2); vector.push(0); vector.extend_from_slice(&1u16.to_le_bytes()); - // AlpEncodedForVectorInfo: frame_of_reference=10, bit_width=0 vector.extend_from_slice(&10u64.to_le_bytes()); vector.push(0); - // Packed values: bit_width=0 and num_elements=1 -> 0 bytes. - // Exception positions (1 * u16) vector.extend_from_slice(&0u16.to_le_bytes()); - // Exception values (1 * f64 bytes) vector.extend_from_slice(&42.5_f64.to_le_bytes()); let offsets = [4u32]; let mut page = make_alp_page_bytes(1, 0, 0, 0, 1, &offsets, 0); page.extend_from_slice(&vector); - let parsed = parse_alp_page_layout::(&page).unwrap(); + let parsed = parse_alp_page_layout::(Bytes::from(page)).unwrap(); assert_eq!(parsed.vectors.len(), 1); assert_eq!(parsed.vectors[0].num_elements, 1); assert_eq!(parsed.vectors[0].alp_info.num_exceptions, 1); assert_eq!(parsed.vectors[0].for_info.bit_width, 0); assert_eq!(parsed.vectors[0].exception_positions, vec![0]); - assert_eq!( - parsed.vectors[0].exception_values, - 42.5_f64.to_le_bytes().to_vec() - ); + assert_eq!(parsed.vectors[0].exception_values, vec![42.5_f64.to_bits()]); + } + + #[test] + fn test_bit_unpack_integers_width_zero() { + let unpacked = bit_unpack_integers::(&[], 0, 3).unwrap(); + assert_eq!(unpacked, vec![0, 0, 0]); + } + + #[test] + fn test_bit_unpack_integers_width_two() { + let unpacked = bit_unpack_integers::(&[0b0010_0111], 2, 3).unwrap(); + assert_eq!(unpacked, vec![3, 1, 2]); + } + + #[test] + fn test_inverse_for_and_patch_exceptions() { + let mut decoded = vec![0u32, 3, 2]; + inverse_for(&mut decoded, 10); + assert_eq!(decoded, vec![10, 13, 12]); + + patch_exceptions(&mut decoded, &[1], &[99]).unwrap(); + assert_eq!(decoded, vec![10, 99, 12]); + } + + #[test] + fn test_decode_vector_exact() { + let vector = AlpEncodedVectorView:: { + num_elements: 3, + alp_info: AlpEncodedVectorInfo { + exponent: 0, + factor: 0, + num_exceptions: 1, + }, + for_info: AlpEncodedForVectorInfo { + frame_of_reference: 10, + bit_width: 2, + }, + packed_values: 0..1, + exception_positions: vec![2], + exception_values: vec![77], + }; + + let decoded = decode_vector_exact(&[0b0010_1100], &vector).unwrap(); + assert_eq!(decoded, vec![10, 13, 77]); } } From 5f0222eac7ac9caffd42643d8b82872a1ab41245 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 18 Feb 2026 18:06:46 -0500 Subject: [PATCH 06/22] Decode page works --- parquet/src/encodings/decoding/alp.rs | 407 +++++++++++++------------- parquet/tests/arrow_reader/alp.rs | 25 +- 2 files changed, 226 insertions(+), 206 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index e234208a6daf..8238da8ae929 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -121,9 +121,6 @@ impl AlpEncodedForVectorInfo { /// `packed_values` is a zero-copy range into page body bytes. /// Exception positions/values are copied for straightforward decode handling. /// Parsed view of one vector. -/// -/// `packed_values` is a byte range into [`AlpPageLayout::body`] (zero-copy), -/// matching the C++ `LoadViewDataOnly` model for packed bytes. #[derive(Debug)] struct AlpEncodedVectorView { num_elements: u16, @@ -140,115 +137,11 @@ struct AlpEncodedVectorView { struct AlpPageLayout { header: AlpHeader, body: Bytes, + #[allow(dead_code)] offsets: Vec, vectors: Vec>, } -/// Type-erased wrapper over parsed `f32`/`f64` page layouts. -#[derive(Debug)] -enum AlpPageLayoutAny { - F32(AlpPageLayout), - F64(AlpPageLayout), -} - -impl AlpPageLayoutAny { - fn num_vectors(&self) -> usize { - match self { - Self::F32(layout) => layout.vectors.len(), - Self::F64(layout) => layout.vectors.len(), - } - } - - fn num_offsets(&self) -> usize { - match self { - Self::F32(layout) => layout.offsets.len(), - Self::F64(layout) => layout.offsets.len(), - } - } - - fn parsed_values(&self) -> usize { - match self { - Self::F32(layout) => layout.vectors.iter().map(|v| v.num_elements as usize).sum(), - Self::F64(layout) => layout.vectors.iter().map(|v| v.num_elements as usize).sum(), - } - } - - fn total_exceptions(&self) -> usize { - match self { - Self::F32(layout) => layout - .vectors - .iter() - .map(|v| v.alp_info.num_exceptions as usize) - .sum(), - Self::F64(layout) => layout - .vectors - .iter() - .map(|v| v.alp_info.num_exceptions as usize) - .sum(), - } - } - - fn total_packed_bytes(&self) -> usize { - match self { - Self::F32(layout) => layout - .vectors - .iter() - .map(|v| v.packed_values.end - v.packed_values.start) - .sum(), - Self::F64(layout) => layout - .vectors - .iter() - .map(|v| v.packed_values.end - v.packed_values.start) - .sum(), - } - } - - fn total_exception_bytes(&self) -> usize { - match self { - Self::F32(layout) => layout - .vectors - .iter() - .map(|v| v.exception_values.len() * std::mem::size_of::()) - .sum(), - Self::F64(layout) => layout - .vectors - .iter() - .map(|v| v.exception_values.len() * std::mem::size_of::()) - .sum(), - } - } - - fn sum_for_xor(&self) -> u64 { - match self { - Self::F32(layout) => layout - .vectors - .iter() - .fold(0u64, |acc, v| acc ^ v.for_info.frame_of_reference.to_u64()), - Self::F64(layout) => layout - .vectors - .iter() - .fold(0u64, |acc, v| acc ^ v.for_info.frame_of_reference.to_u64()), - } - } - - fn sum_positions(&self) -> usize { - match self { - Self::F32(layout) => layout - .vectors - .iter() - .flat_map(|v| v.exception_positions.iter()) - .map(|v| *v as usize) - .sum(), - Self::F64(layout) => layout - .vectors - .iter() - .flat_map(|v| v.exception_positions.iter()) - .map(|v| *v as usize) - .sum(), - } - } -} - /// Exact integer type used by FOR reconstruction. /// /// This mirrors C++: @@ -261,23 +154,21 @@ impl AlpPageLayoutAny { /// - Signed interpretation is applied later during decimal reconstruction. trait AlpExact: Copy + std::fmt::Debug { const WIDTH: usize; + type Signed: Copy; fn from_le_slice(slice: &[u8]) -> Self; - fn to_u64(self) -> u64; fn zero() -> Self; fn wrapping_add(self, rhs: Self) -> Self; + fn reinterpret_as_signed(self) -> Self::Signed; } impl AlpExact for u32 { const WIDTH: usize = 4; + type Signed = i32; fn from_le_slice(slice: &[u8]) -> Self { u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]) } - fn to_u64(self) -> u64 { - self as u64 - } - fn zero() -> Self { 0 } @@ -285,10 +176,15 @@ impl AlpExact for u32 { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + + fn reinterpret_as_signed(self) -> Self::Signed { + i32::from_ne_bytes(self.to_ne_bytes()) + } } impl AlpExact for u64 { const WIDTH: usize = 8; + type Signed = i64; fn from_le_slice(slice: &[u8]) -> Self { u64::from_le_bytes([ @@ -296,10 +192,6 @@ impl AlpExact for u64 { ]) } - fn to_u64(self) -> u64 { - self - } - fn zero() -> Self { 0 } @@ -307,6 +199,124 @@ impl AlpExact for u64 { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + + fn reinterpret_as_signed(self) -> Self::Signed { + i64::from_ne_bytes(self.to_ne_bytes()) + } +} + +const ALP_I64_POW10: [i64; 19] = [ + 1, + 10, + 100, + 1_000, + 10_000, + 100_000, + 1_000_000, + 10_000_000, + 100_000_000, + 1_000_000_000, + 10_000_000_000, + 100_000_000_000, + 1_000_000_000_000, + 10_000_000_000_000, + 100_000_000_000_000, + 1_000_000_000_000_000, + 10_000_000_000_000_000, + 100_000_000_000_000_000, + 1_000_000_000_000_000_000, +]; + +const ALP_NEG_POW10_F32: [f32; 11] = [ + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, +]; + +const ALP_NEG_POW10_F64: [f64; 19] = [ + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001, + 0.0000000000000001, + 0.00000000000000001, + 0.000000000000000001, +]; + +trait AlpFloat: Copy + Default { + type Exact: AlpExact + FromBytes; + + fn decode_value( + signed_encoded: ::Signed, + exponent: u8, + factor: u8, + ) -> Result; + + fn from_exact_bits(bits: Self::Exact) -> Self; +} + +impl AlpFloat for f32 { + type Exact = u32; + + fn decode_value(signed_encoded: i32, exponent: u8, factor: u8) -> Result { + let factor_multiplier = ALP_I64_POW10 + .get(factor as usize) + .ok_or_else(|| general_err!("Invalid ALP page: factor {} exceeds max 18", factor))?; + let exponent_multiplier = ALP_NEG_POW10_F32.get(exponent as usize).ok_or_else(|| { + general_err!( + "Invalid ALP page: exponent {} exceeds max {} for f32", + exponent, + ALP_NEG_POW10_F32.len() - 1 + ) + })?; + Ok((signed_encoded as f32) * (*factor_multiplier as f32) * *exponent_multiplier) + } + + fn from_exact_bits(bits: Self::Exact) -> Self { + f32::from_bits(bits) + } +} + +impl AlpFloat for f64 { + type Exact = u64; + + fn decode_value(signed_encoded: i64, exponent: u8, factor: u8) -> Result { + let factor_multiplier = ALP_I64_POW10 + .get(factor as usize) + .ok_or_else(|| general_err!("Invalid ALP page: factor {} exceeds max 18", factor))?; + let exponent_multiplier = ALP_NEG_POW10_F64.get(exponent as usize).ok_or_else(|| { + general_err!( + "Invalid ALP page: exponent {} exceeds max {} for f64", + exponent, + ALP_NEG_POW10_F64.len() - 1 + ) + })?; + Ok((signed_encoded as f64) * (*factor_multiplier as f64) * *exponent_multiplier) + } + + fn from_exact_bits(bits: Self::Exact) -> Self { + f64::from_bits(bits) + } } /// Parse and validate a full ALP-encoded page body. @@ -584,6 +594,7 @@ fn inverse_for(deltas: &mut [Exact], frame_of_reference: Exact) } /// Patch exception values at their original positions. +#[cfg(test)] fn patch_exceptions( decoded: &mut [Exact], exception_positions: &[u16], @@ -614,6 +625,7 @@ fn patch_exceptions( /// Decode one vector to exact integers: /// bit-unpack -> inverse FOR -> exception patch. +#[cfg(test)] fn decode_vector_exact( body: &[u8], vector: &AlpEncodedVectorView, @@ -632,118 +644,115 @@ fn decode_vector_exact( Ok(decoded) } -/// Decode all vectors in a page into exact integers (still pre-decimal reconstruction). -fn decode_page_exact(layout: &AlpPageLayout) -> Result> { +/// Decode one vector into output floating values: +/// bit-unpack -> inverse FOR -> decimal decode -> patch exceptions. +fn decode_vector_values( + body: &[u8], + vector: &AlpEncodedVectorView, +) -> Result> { + let mut exact_values = bit_unpack_integers( + &body[vector.packed_values.clone()], + vector.for_info.bit_width, + vector.num_elements, + )?; + inverse_for(&mut exact_values, vector.for_info.frame_of_reference); + + let mut out = Vec::with_capacity(vector.num_elements as usize); + for exact_value in exact_values { + let signed_value = exact_value.reinterpret_as_signed(); + out.push(Value::decode_value( + signed_value, + vector.alp_info.exponent, + vector.alp_info.factor, + )?); + } + + if vector.exception_positions.len() != vector.exception_values.len() { + return Err(general_err!( + "Invalid ALP page: exception positions ({}) and values ({}) length mismatch", + vector.exception_positions.len(), + vector.exception_values.len() + )); + } + + for (pos, value_bits) in vector + .exception_positions + .iter() + .zip(vector.exception_values.iter()) + { + let pos = *pos as usize; + if pos >= out.len() { + return Err(general_err!( + "Invalid ALP page: exception position {} out of bounds for vector length {}", + pos, + out.len() + )); + } + out[pos] = Value::from_exact_bits(*value_bits); + } + + Ok(out) +} + +fn decode_page_values(layout: &AlpPageLayout) -> Result> { let mut out = Vec::with_capacity(layout.header.num_elements_usize()); for vector in &layout.vectors { - out.extend_from_slice(&decode_vector_exact(layout.body.as_ref(), vector)?); + out.extend_from_slice(&decode_vector_values::( + layout.body.as_ref(), + vector, + )?); } Ok(out) } pub(crate) struct AlpDecoder { - num_values: usize, - layout: Option, + decoded_values: Vec, + values_decoded: usize, _marker: PhantomData, } impl AlpDecoder { pub(crate) fn new() -> Self { Self { - num_values: 0, - layout: None, + decoded_values: Vec::new(), + values_decoded: 0, _marker: PhantomData, } } } -impl Decoder for AlpDecoder { +impl Decoder for AlpDecoder +where + T::T: AlpFloat, +{ fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { - let layout = match std::mem::size_of::() { - 4 => AlpPageLayoutAny::F32(parse_alp_page_layout::(data)?), - 8 => AlpPageLayoutAny::F64(parse_alp_page_layout::(data)?), - type_size => { - return Err(general_err!( - "Invalid ALP page: exact type size {} is unsupported", - type_size - )) - } - }; + let layout = parse_alp_page_layout::<::Exact>(data)?; - let header_num_elements = match &layout { - AlpPageLayoutAny::F32(layout) => layout.header.num_elements, - AlpPageLayoutAny::F64(layout) => layout.header.num_elements, - }; - - if header_num_elements as usize != num_values { + if layout.header.num_elements_usize() != num_values { return Err(general_err!( "Invalid ALP page: header num_elements {} does not match page num_values {}", - header_num_elements, + layout.header.num_elements, num_values )); } - self.num_values = num_values; - self.layout = Some(layout); + self.decoded_values = decode_page_values::(&layout)?; + self.values_decoded = 0; Ok(()) } - fn get(&mut self, _buffer: &mut [T::T]) -> Result { - let num_vectors = self.layout.as_ref().map(|layout| layout.num_vectors()).unwrap_or(0); - let num_offsets = self.layout.as_ref().map(|layout| layout.num_offsets()).unwrap_or(0); - let parsed_values = self - .layout - .as_ref() - .map(|layout| layout.parsed_values()) - .unwrap_or(0); - let total_exceptions = self - .layout - .as_ref() - .map(|layout| layout.total_exceptions()) - .unwrap_or(0); - let total_packed_bytes = self - .layout - .as_ref() - .map(|layout| layout.total_packed_bytes()) - .unwrap_or(0); - let total_exception_bytes = self - .layout - .as_ref() - .map(|layout| layout.total_exception_bytes()) - .unwrap_or(0); - let sum_for = self.layout.as_ref().map(|layout| layout.sum_for_xor()).unwrap_or(0); - let sum_positions = self - .layout - .as_ref() - .map(|layout| layout.sum_positions()) - .unwrap_or(0); - - let decoded_checksum = match self.layout.as_ref() { - Some(AlpPageLayoutAny::F32(layout)) => decode_page_exact(layout)? - .into_iter() - .fold(0u64, |acc, v| acc ^ v.to_u64()), - Some(AlpPageLayoutAny::F64(layout)) => decode_page_exact(layout)? - .into_iter() - .fold(0u64, |acc, v| acc ^ v.to_u64()), - None => 0, - }; - - Err(nyi_err!( - "Encoding ALP page layout parsed ({} vectors, {} offsets, {} values, {} exceptions, {} packed bytes, {} exception bytes, for-xor {}, pos-sum {}, decoded-xor {}), value decoding is not implemented", - num_vectors, - num_offsets, - parsed_values, - total_exceptions, - total_packed_bytes, - total_exception_bytes, - sum_for, - sum_positions, - decoded_checksum - )) + fn get(&mut self, buffer: &mut [T::T]) -> Result { + let num_values = buffer.len().min(self.values_left()); + let end = self.values_decoded + num_values; + buffer[..num_values].copy_from_slice(&self.decoded_values[self.values_decoded..end]); + self.values_decoded = end; + Ok(num_values) } fn values_left(&self) -> usize { - self.num_values + self.decoded_values + .len() + .saturating_sub(self.values_decoded) } fn encoding(&self) -> Encoding { @@ -751,8 +760,8 @@ impl Decoder for AlpDecoder { } fn skip(&mut self, num_values: usize) -> Result { - let skipped = num_values.min(self.num_values); - self.num_values -= skipped; + let skipped = num_values.min(self.values_left()); + self.values_decoded += skipped; Ok(skipped) } } diff --git a/parquet/tests/arrow_reader/alp.rs b/parquet/tests/arrow_reader/alp.rs index e92a1ef7797f..a74f32693539 100644 --- a/parquet/tests/arrow_reader/alp.rs +++ b/parquet/tests/arrow_reader/alp.rs @@ -16,10 +16,12 @@ // under the License. use arrow::util::test_util::parquet_test_data; +use arrow_array::cast::as_primitive_array; +use arrow_array::types::Float64Type; use parquet::arrow::arrow_reader::ArrowReaderBuilder; #[test] -fn test_read_single_f64_alp_page_layout() { +fn test_read_single_f64_alp() { let path = std::path::PathBuf::from(parquet_test_data()).join("single_f64_ALP.parquet"); if !path.exists() { eprintln!("Skipping ALP test file not found: {}", path.display()); @@ -29,13 +31,22 @@ fn test_read_single_f64_alp_page_layout() { let file = std::fs::File::open(path).unwrap(); let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); - let err = loop { + let mut total_rows = 0usize; + let mut first_value = None; + loop { match reader.next() { - Some(Ok(_)) => continue, - Some(Err(err)) => break err, - None => panic!("Expected ALP decode to fail with NYI"), + Some(Ok(batch)) => { + total_rows += batch.num_rows(); + if first_value.is_none() && batch.num_rows() > 0 { + let values = as_primitive_array::(batch.column(0).as_ref()); + first_value = Some(values.value(0)); + } + } + Some(Err(err)) => panic!("Unexpected ALP decode error: {err}"), + None => break, } - }; + } - assert!(err.to_string().contains("Encoding ALP page layout parsed")); + assert_eq!(total_rows, 1024); + assert_eq!(first_value, Some(0.125)); } From 027172fc8961a31e9731d6bb2c1a419509e83bc9 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 18 Feb 2026 21:17:13 -0500 Subject: [PATCH 07/22] Add more checks and tests --- parquet/src/encodings/decoding/alp.rs | 317 +++++++++++++++++++++++--- 1 file changed, 281 insertions(+), 36 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 8238da8ae929..66d71245251f 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -31,6 +31,8 @@ const ALP_VERSION: u8 = 1; const ALP_COMPRESSION_MODE: u8 = 0; const ALP_INTEGER_ENCODING_FOR_BIT_PACK: u8 = 0; const ALP_MAX_LOG_VECTOR_SIZE: u8 = 16; +const ALP_MAX_EXPONENT_F32: u8 = 10; +const ALP_MAX_EXPONENT_F64: u8 = 18; /// Page-level ALP header (version 1, 8 bytes). /// @@ -120,7 +122,6 @@ impl AlpEncodedForVectorInfo { /// /// `packed_values` is a zero-copy range into page body bytes. /// Exception positions/values are copied for straightforward decode handling. -/// Parsed view of one vector. #[derive(Debug)] struct AlpEncodedVectorView { num_elements: u16, @@ -266,11 +267,14 @@ const ALP_NEG_POW10_F64: [f64; 19] = [ trait AlpFloat: Copy + Default { type Exact: AlpExact + FromBytes; - fn decode_value( - signed_encoded: ::Signed, - exponent: u8, - factor: u8, - ) -> Result; + /// Precompute vector-level ALP decimal scale for: + /// `value = encoded * 10^(factor) * 10^(-exponent)`. + /// + /// Preconditions are validated during page parse. + fn decode_scale(exponent: u8, factor: u8) -> Self; + + /// Decode one signed exact integer using a precomputed scale. + fn decode_value(signed_encoded: ::Signed, scale: Self) -> Self; fn from_exact_bits(bits: Self::Exact) -> Self; } @@ -278,18 +282,14 @@ trait AlpFloat: Copy + Default { impl AlpFloat for f32 { type Exact = u32; - fn decode_value(signed_encoded: i32, exponent: u8, factor: u8) -> Result { - let factor_multiplier = ALP_I64_POW10 - .get(factor as usize) - .ok_or_else(|| general_err!("Invalid ALP page: factor {} exceeds max 18", factor))?; - let exponent_multiplier = ALP_NEG_POW10_F32.get(exponent as usize).ok_or_else(|| { - general_err!( - "Invalid ALP page: exponent {} exceeds max {} for f32", - exponent, - ALP_NEG_POW10_F32.len() - 1 - ) - })?; - Ok((signed_encoded as f32) * (*factor_multiplier as f32) * *exponent_multiplier) + fn decode_scale(exponent: u8, factor: u8) -> Self { + debug_assert!(exponent <= ALP_MAX_EXPONENT_F32); + debug_assert!(factor <= exponent); + (ALP_I64_POW10[factor as usize] as f32) * ALP_NEG_POW10_F32[exponent as usize] + } + + fn decode_value(signed_encoded: i32, scale: Self) -> Self { + (signed_encoded as f32) * scale } fn from_exact_bits(bits: Self::Exact) -> Self { @@ -300,18 +300,14 @@ impl AlpFloat for f32 { impl AlpFloat for f64 { type Exact = u64; - fn decode_value(signed_encoded: i64, exponent: u8, factor: u8) -> Result { - let factor_multiplier = ALP_I64_POW10 - .get(factor as usize) - .ok_or_else(|| general_err!("Invalid ALP page: factor {} exceeds max 18", factor))?; - let exponent_multiplier = ALP_NEG_POW10_F64.get(exponent as usize).ok_or_else(|| { - general_err!( - "Invalid ALP page: exponent {} exceeds max {} for f64", - exponent, - ALP_NEG_POW10_F64.len() - 1 - ) - })?; - Ok((signed_encoded as f64) * (*factor_multiplier as f64) * *exponent_multiplier) + fn decode_scale(exponent: u8, factor: u8) -> Self { + debug_assert!(exponent <= ALP_MAX_EXPONENT_F64); + debug_assert!(factor <= exponent); + (ALP_I64_POW10[factor as usize] as f64) * ALP_NEG_POW10_F64[exponent as usize] + } + + fn decode_value(signed_encoded: i64, scale: Self) -> Self { + (signed_encoded as f64) * scale } fn from_exact_bits(bits: Self::Exact) -> Self { @@ -492,6 +488,36 @@ fn parse_vector_view( num_exceptions: u16::from_le_bytes([vector_bytes[2], vector_bytes[3]]), }; + let max_exponent = if Exact::WIDTH == 4 { + ALP_MAX_EXPONENT_F32 + } else { + ALP_MAX_EXPONENT_F64 + }; + + if alp_info.exponent > max_exponent { + return Err(general_err!( + "Invalid ALP page: exponent {} exceeds max {}", + alp_info.exponent, + max_exponent + )); + } + + if alp_info.factor > alp_info.exponent { + return Err(general_err!( + "Invalid ALP page: factor {} exceeds exponent {}", + alp_info.factor, + alp_info.exponent + )); + } + + if alp_info.num_exceptions > num_elements { + return Err(general_err!( + "Invalid ALP page: num_exceptions {} exceeds vector num_elements {}", + alp_info.num_exceptions, + num_elements + )); + } + let for_start = AlpEncodedVectorInfo::STORED_SIZE; let for_end = for_start + Exact::WIDTH; let frame_of_reference = Exact::from_le_slice(&vector_bytes[for_start..for_end]); @@ -533,7 +559,15 @@ fn parse_vector_view( let mut exception_positions = Vec::with_capacity(alp_info.num_exceptions as usize); for chunk in data[positions_start..positions_end].chunks_exact(2) { - exception_positions.push(u16::from_le_bytes([chunk[0], chunk[1]])); + let position = u16::from_le_bytes([chunk[0], chunk[1]]); + if position >= num_elements { + return Err(general_err!( + "Invalid ALP page: exception position {} out of bounds for vector length {}", + position, + num_elements + )); + } + exception_positions.push(position); } let packed_values = @@ -657,14 +691,12 @@ fn decode_vector_values( )?; inverse_for(&mut exact_values, vector.for_info.frame_of_reference); + let scale = Value::decode_scale(vector.alp_info.exponent, vector.alp_info.factor); + let mut out = Vec::with_capacity(vector.num_elements as usize); for exact_value in exact_values { let signed_value = exact_value.reinterpret_as_signed(); - out.push(Value::decode_value( - signed_value, - vector.alp_info.exponent, - vector.alp_info.factor, - )?); + out.push(Value::decode_value(signed_value, scale)); } if vector.exception_positions.len() != vector.exception_values.len() { @@ -705,6 +737,14 @@ fn decode_page_values(layout: &AlpPageLayout) -> Ok(out) } +/// Decoder for ALP-encoded floating-point pages (`f32`/`f64`). +/// +/// Current behavior: +/// - `set_data` parses + validates page layout and eagerly decodes all values. +/// - `get` copies from the decoded buffer into the requested output slice. +/// - `skip` advances the decoded cursor without additional parsing/decoding. +/// +/// This keeps ALP decoding straightforward while functionality is being brought up. pub(crate) struct AlpDecoder { decoded_values: Vec, values_decoded: usize, @@ -792,6 +832,92 @@ mod tests { out } + fn make_vector_u32( + exponent: u8, + factor: u8, + num_exceptions: u16, + frame_of_reference: u32, + bit_width: u8, + packed_values: &[u8], + exception_positions: &[u16], + exception_values: &[u32], + ) -> Vec { + assert_eq!(num_exceptions as usize, exception_positions.len()); + assert_eq!(num_exceptions as usize, exception_values.len()); + + let mut out = Vec::new(); + out.push(exponent); + out.push(factor); + out.extend_from_slice(&num_exceptions.to_le_bytes()); + out.extend_from_slice(&frame_of_reference.to_le_bytes()); + out.push(bit_width); + out.extend_from_slice(packed_values); + for position in exception_positions { + out.extend_from_slice(&position.to_le_bytes()); + } + for value in exception_values { + out.extend_from_slice(&value.to_le_bytes()); + } + out + } + + fn make_vector_u64( + exponent: u8, + factor: u8, + num_exceptions: u16, + frame_of_reference: u64, + bit_width: u8, + packed_values: &[u8], + exception_positions: &[u16], + exception_values: &[u64], + ) -> Vec { + assert_eq!(num_exceptions as usize, exception_positions.len()); + assert_eq!(num_exceptions as usize, exception_values.len()); + + let mut out = Vec::new(); + out.push(exponent); + out.push(factor); + out.extend_from_slice(&num_exceptions.to_le_bytes()); + out.extend_from_slice(&frame_of_reference.to_le_bytes()); + out.push(bit_width); + out.extend_from_slice(packed_values); + for position in exception_positions { + out.extend_from_slice(&position.to_le_bytes()); + } + for value in exception_values { + out.extend_from_slice(&value.to_le_bytes()); + } + out + } + + fn make_page_from_vectors( + log_vector_size: u8, + num_elements: i32, + vectors: &[Vec], + ) -> Vec { + let vector_size = 1usize << log_vector_size; + let expected_num_vectors = if num_elements <= 0 { + 0 + } else { + (num_elements as usize).div_ceil(vector_size) + }; + assert_eq!(vectors.len(), expected_num_vectors); + + let offsets_section_size = vectors.len() * std::mem::size_of::(); + let mut offsets = Vec::with_capacity(vectors.len()); + let mut running_offset = offsets_section_size as u32; + for vector in vectors { + offsets.push(running_offset); + running_offset += vector.len() as u32; + } + + let mut page = make_alp_page_bytes(1, 0, 0, log_vector_size, num_elements, &offsets, 0); + for vector in vectors { + page.extend_from_slice(vector); + } + page + } + #[test] fn test_parse_alp_page_layout_valid() { let data = make_alp_page_bytes(1, 0, 0, 2, 4, &[4], 13); @@ -840,6 +966,88 @@ mod tests { ); } + #[test] + fn test_parse_alp_page_layout_invalid_exponent_f32() { + let vector = make_vector_u32(11, 0, 0, 0, 0, &[], &[], &[]); + let page = make_page_from_vectors(0, 1, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: exponent 11 exceeds max 10") + ); + } + + #[test] + fn test_parse_alp_page_layout_invalid_factor_f32() { + let vector = make_vector_u32(0, 11, 0, 0, 0, &[], &[], &[]); + let page = make_page_from_vectors(0, 1, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: factor 11 exceeds exponent 0") + ); + } + + #[test] + fn test_parse_alp_page_layout_factor_exceeds_exponent() { + let vector = make_vector_u32(2, 3, 0, 0, 0, &[], &[], &[]); + let page = make_page_from_vectors(0, 1, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: factor 3 exceeds exponent 2") + ); + } + + #[test] + fn test_parse_alp_page_layout_invalid_num_exceptions() { + let vector = make_vector_u32(0, 0, 2, 0, 0, &[], &[0, 0], &[0, 0]); + let page = make_page_from_vectors(0, 1, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: num_exceptions 2 exceeds vector num_elements 1") + ); + } + + #[test] + fn test_parse_alp_page_layout_invalid_exception_position() { + let vector = make_vector_u32(0, 0, 1, 0, 0, &[], &[1], &[123]); + let page = make_page_from_vectors(0, 1, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string().contains( + "Invalid ALP page: exception position 1 out of bounds for vector length 1" + ) + ); + } + + #[test] + fn test_parse_alp_page_layout_non_monotonic_offsets() { + let data = make_alp_page_bytes(1, 0, 0, 1, 3, &[12, 8], 12); + let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: vector offsets are not monotonic at index 0") + ); + } + + #[test] + fn test_parse_alp_page_layout_truncated_vector_data() { + let mut vector = Vec::new(); + vector.push(0); + vector.push(0); + vector.extend_from_slice(&0u16.to_le_bytes()); + vector.extend_from_slice(&10u32.to_le_bytes()); + vector.push(1); + let page = make_page_from_vectors(1, 2, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: vector data too short") + ); + } + #[test] fn test_parse_alp_page_layout_parses_vector_view_data_only_f64() { let mut vector = Vec::new(); @@ -910,4 +1118,41 @@ mod tests { let decoded = decode_vector_exact(&[0b0010_1100], &vector).unwrap(); assert_eq!(decoded, vec![10, 13, 77]); } + + #[test] + fn test_decode_page_values_f32_no_exceptions() { + let vector = make_vector_u32(0, 0, 0, 10, 2, &[0b1110_0100], &[], &[]); + let page = make_page_from_vectors(2, 4, &[vector]); + let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); + let decoded = decode_page_values::(&layout).unwrap(); + assert_eq!(decoded, vec![10.0, 11.0, 12.0, 13.0]); + } + + #[test] + fn test_decode_page_values_f64_multi_vector_with_exceptions() { + let vector0 = make_vector_u64(0, 0, 1, 10, 1, &[0b0000_0010], &[1], &[42.5f64.to_bits()]); + let vector1 = make_vector_u64(0, 0, 0, 7, 0, &[], &[], &[]); + let page = make_page_from_vectors(1, 3, &[vector0, vector1]); + let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); + let decoded = decode_page_values::(&layout).unwrap(); + assert_eq!(decoded, vec![10.0, 42.5, 7.0]); + } + + #[test] + fn test_decode_page_values_f32_edge_values_via_exceptions() { + let edge_values = [ + f32::NAN.to_bits(), + (-0.0f32).to_bits(), + f32::INFINITY.to_bits(), + ]; + let vector = make_vector_u32(0, 0, 3, 0, 0, &[], &[0, 1, 2], &edge_values); + let page = make_page_from_vectors(2, 3, &[vector]); + let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); + let decoded = decode_page_values::(&layout).unwrap(); + + assert!(decoded[0].is_nan()); + assert_eq!(decoded[1], -0.0); + assert!(decoded[1].is_sign_negative()); + assert_eq!(decoded[2], f32::INFINITY); + } } From 8ee1b7ef61f0a1b0297b3ab1cfce493a7b7c2d92 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 15:29:52 -0500 Subject: [PATCH 08/22] Make decode page lazy + some tests --- parquet/src/encodings/decoding/alp.rs | 212 +++++++++++++------------- 1 file changed, 110 insertions(+), 102 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 66d71245251f..8c55f2b71d3b 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -138,7 +138,6 @@ struct AlpEncodedVectorView { struct AlpPageLayout { header: AlpHeader, body: Bytes, - #[allow(dead_code)] offsets: Vec, vectors: Vec>, } @@ -153,7 +152,7 @@ struct AlpPageLayout { /// - FOR stores non-negative deltas optimized for bitpacking. /// - Unsigned arithmetic avoids signed-overflow edge cases in FOR stage. /// - Signed interpretation is applied later during decimal reconstruction. -trait AlpExact: Copy + std::fmt::Debug { +pub(super) trait AlpExact: Copy + std::fmt::Debug { const WIDTH: usize; type Signed: Copy; fn from_le_slice(slice: &[u8]) -> Self; @@ -264,7 +263,7 @@ const ALP_NEG_POW10_F64: [f64; 19] = [ 0.000000000000000001, ]; -trait AlpFloat: Copy + Default { +pub(super) trait AlpFloat: Copy + Default { type Exact: AlpExact + FromBytes; /// Precompute vector-level ALP decimal scale for: @@ -627,57 +626,6 @@ fn inverse_for(deltas: &mut [Exact], frame_of_reference: Exact) } } -/// Patch exception values at their original positions. -#[cfg(test)] -fn patch_exceptions( - decoded: &mut [Exact], - exception_positions: &[u16], - exception_values: &[Exact], -) -> Result<()> { - if exception_positions.len() != exception_values.len() { - return Err(general_err!( - "Invalid ALP page: exception positions ({}) and values ({}) length mismatch", - exception_positions.len(), - exception_values.len() - )); - } - - for (pos, value) in exception_positions.iter().zip(exception_values.iter()) { - let pos = *pos as usize; - if pos >= decoded.len() { - return Err(general_err!( - "Invalid ALP page: exception position {} out of bounds for vector length {}", - pos, - decoded.len() - )); - } - decoded[pos] = *value; - } - - Ok(()) -} - -/// Decode one vector to exact integers: -/// bit-unpack -> inverse FOR -> exception patch. -#[cfg(test)] -fn decode_vector_exact( - body: &[u8], - vector: &AlpEncodedVectorView, -) -> Result> { - let mut decoded = bit_unpack_integers( - &body[vector.packed_values.clone()], - vector.for_info.bit_width, - vector.num_elements, - )?; - inverse_for(&mut decoded, vector.for_info.frame_of_reference); - patch_exceptions( - &mut decoded, - &vector.exception_positions, - &vector.exception_values, - )?; - Ok(decoded) -} - /// Decode one vector into output floating values: /// bit-unpack -> inverse FOR -> decimal decode -> patch exceptions. fn decode_vector_values( @@ -740,30 +688,59 @@ fn decode_page_values(layout: &AlpPageLayout) -> /// Decoder for ALP-encoded floating-point pages (`f32`/`f64`). /// /// Current behavior: -/// - `set_data` parses + validates page layout and eagerly decodes all values. -/// - `get` copies from the decoded buffer into the requested output slice. -/// - `skip` advances the decoded cursor without additional parsing/decoding. -/// -/// This keeps ALP decoding straightforward while functionality is being brought up. -pub(crate) struct AlpDecoder { +/// - `set_data` parses + validates page metadata and stores ALP layout state. +/// - `get` lazily decodes the full page once, then copies from the decoded buffer. +/// - `skip` advances the decoded cursor. +pub(crate) struct AlpDecoder +where + T::T: AlpFloat, + ::Exact: Send, +{ + layout: Option::Exact>>, decoded_values: Vec, - values_decoded: usize, + current_offset: usize, + needs_decode: bool, + num_values: usize, _marker: PhantomData, } -impl AlpDecoder { +impl AlpDecoder +where + T::T: AlpFloat, + ::Exact: Send, +{ pub(crate) fn new() -> Self { Self { + layout: None, decoded_values: Vec::new(), - values_decoded: 0, + current_offset: 0, + needs_decode: false, + num_values: 0, _marker: PhantomData, } } + + fn ensure_decoded(&mut self) -> Result<()> { + if !self.needs_decode { + return Ok(()); + } + + let layout = self.layout.take().ok_or_else(|| { + general_err!("Invalid ALP decoder state: set_data must be called before get/skip") + })?; + + debug_assert_eq!(layout.offsets.len(), layout.vectors.len()); + self.decoded_values = decode_page_values::(&layout)?; + self.needs_decode = false; + + Ok(()) + } } impl Decoder for AlpDecoder where T::T: AlpFloat, + ::Exact: Send, { fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { let layout = parse_alp_page_layout::<::Exact>(data)?; @@ -776,23 +753,30 @@ where )); } - self.decoded_values = decode_page_values::(&layout)?; - self.values_decoded = 0; + self.layout = Some(layout); + self.decoded_values.clear(); + self.current_offset = 0; + self.needs_decode = num_values > 0; + self.num_values = num_values; Ok(()) } fn get(&mut self, buffer: &mut [T::T]) -> Result { - let num_values = buffer.len().min(self.values_left()); - let end = self.values_decoded + num_values; - buffer[..num_values].copy_from_slice(&self.decoded_values[self.values_decoded..end]); - self.values_decoded = end; - Ok(num_values) + let target = buffer.len().min(self.num_values); + if target == 0 { + return Ok(0); + } + + self.ensure_decoded()?; + let end = self.current_offset + target; + buffer[..target].copy_from_slice(&self.decoded_values[self.current_offset..end]); + self.current_offset = end; + self.num_values -= target; + Ok(target) } fn values_left(&self) -> usize { - self.decoded_values - .len() - .saturating_sub(self.values_decoded) + self.num_values } fn encoding(&self) -> Encoding { @@ -800,15 +784,22 @@ where } fn skip(&mut self, num_values: usize) -> Result { - let skipped = num_values.min(self.values_left()); - self.values_decoded += skipped; - Ok(skipped) + let to_skip = num_values.min(self.num_values); + if to_skip == 0 { + return Ok(0); + } + + self.ensure_decoded()?; + self.current_offset += to_skip; + self.num_values -= to_skip; + Ok(to_skip) } } #[cfg(test)] mod tests { use super::*; + use crate::data_type::FloatType; fn make_alp_page_bytes( version: u8, @@ -1088,35 +1079,10 @@ mod tests { } #[test] - fn test_inverse_for_and_patch_exceptions() { + fn test_inverse_for() { let mut decoded = vec![0u32, 3, 2]; inverse_for(&mut decoded, 10); assert_eq!(decoded, vec![10, 13, 12]); - - patch_exceptions(&mut decoded, &[1], &[99]).unwrap(); - assert_eq!(decoded, vec![10, 99, 12]); - } - - #[test] - fn test_decode_vector_exact() { - let vector = AlpEncodedVectorView:: { - num_elements: 3, - alp_info: AlpEncodedVectorInfo { - exponent: 0, - factor: 0, - num_exceptions: 1, - }, - for_info: AlpEncodedForVectorInfo { - frame_of_reference: 10, - bit_width: 2, - }, - packed_values: 0..1, - exception_positions: vec![2], - exception_values: vec![77], - }; - - let decoded = decode_vector_exact(&[0b0010_1100], &vector).unwrap(); - assert_eq!(decoded, vec![10, 13, 77]); } #[test] @@ -1155,4 +1121,46 @@ mod tests { assert!(decoded[1].is_sign_negative()); assert_eq!(decoded[2], f32::INFINITY); } + + #[test] + fn test_alp_decoder_get_across_vectors() { + let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); + let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); + let page = make_page_from_vectors(1, 4, &[vector0, vector1]); + + let mut decoder = AlpDecoder::::new(); + decoder.set_data(Bytes::from(page), 4).unwrap(); + + let mut first = [0.0f32; 3]; + let read = decoder.get(&mut first).unwrap(); + assert_eq!(read, 3); + assert_eq!(first, [10.0, 11.0, 20.0]); + assert_eq!(decoder.values_left(), 1); + + let mut second = [0.0f32; 2]; + let read = decoder.get(&mut second).unwrap(); + assert_eq!(read, 1); + assert_eq!(second[0], 21.0); + assert_eq!(decoder.values_left(), 0); + } + + #[test] + fn test_alp_decoder_skip_across_vectors() { + let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); + let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); + let page = make_page_from_vectors(1, 4, &[vector0, vector1]); + + let mut decoder = AlpDecoder::::new(); + decoder.set_data(Bytes::from(page), 4).unwrap(); + + let skipped = decoder.skip(3).unwrap(); + assert_eq!(skipped, 3); + assert_eq!(decoder.values_left(), 1); + + let mut out = [0.0f32; 1]; + let read = decoder.get(&mut out).unwrap(); + assert_eq!(read, 1); + assert_eq!(out[0], 21.0); + assert_eq!(decoder.values_left(), 0); + } } From 18f885e834a647c6fdbcceb7a7c264975179e41a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 15:59:10 -0500 Subject: [PATCH 09/22] Find vectors using offsets at decode time, not parse time --- parquet/src/encodings/decoding/alp.rs | 57 ++++++++++++++++----------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 8c55f2b71d3b..189a8aadde04 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -135,11 +135,10 @@ struct AlpEncodedVectorView { /// Parsed ALP page layout for one exact integer width (`u32` for float pages, /// `u64` for double pages). #[derive(Debug)] -struct AlpPageLayout { +struct AlpPageLayout { header: AlpHeader, body: Bytes, offsets: Vec, - vectors: Vec>, } /// Exact integer type used by FOR reconstruction. @@ -321,7 +320,7 @@ impl AlpFloat for f64 { /// - non-negative `num_elements` /// - offsets bounds + monotonicity /// - per-vector metadata/data section lengths -fn parse_alp_page_layout(data: Bytes) -> Result> { +fn parse_alp_page_layout(data: Bytes) -> Result { let data_ref = data.as_ref(); if data_ref.len() < ALP_HEADER_SIZE { return Err(general_err!( @@ -428,7 +427,6 @@ fn parse_alp_page_layout(data: Bytes) -> Result(data: Bytes) -> Result( - body_ref, - vector_start, - vector_end, - vector_num_elements, - )?); + parse_vector_view::(body_ref, vector_start, vector_end, vector_num_elements)?; } Ok(AlpPageLayout { header, body, offsets, - vectors, }) } @@ -674,12 +666,25 @@ fn decode_vector_values( Ok(out) } -fn decode_page_values(layout: &AlpPageLayout) -> Result> { +fn decode_page_values(layout: &AlpPageLayout) -> Result> { let mut out = Vec::with_capacity(layout.header.num_elements_usize()); - for vector in &layout.vectors { + for (vector_idx, vector_offset) in layout.offsets.iter().enumerate() { + let vector_start = *vector_offset as usize; + let vector_end = if vector_idx + 1 < layout.offsets.len() { + layout.offsets[vector_idx + 1] as usize + } else { + layout.body.len() + }; + let vector_num_elements = layout.header.vector_num_elements(vector_idx); + let vector = parse_vector_view::( + layout.body.as_ref(), + vector_start, + vector_end, + vector_num_elements, + )?; out.extend_from_slice(&decode_vector_values::( layout.body.as_ref(), - vector, + &vector, )?); } Ok(out) @@ -696,7 +701,7 @@ where T::T: AlpFloat, ::Exact: Send, { - layout: Option::Exact>>, + layout: Option, decoded_values: Vec, current_offset: usize, needs_decode: bool, @@ -729,7 +734,6 @@ where general_err!("Invalid ALP decoder state: set_data must be called before get/skip") })?; - debug_assert_eq!(layout.offsets.len(), layout.vectors.len()); self.decoded_values = decode_page_values::(&layout)?; self.needs_decode = false; @@ -1058,12 +1062,21 @@ mod tests { page.extend_from_slice(&vector); let parsed = parse_alp_page_layout::(Bytes::from(page)).unwrap(); - assert_eq!(parsed.vectors.len(), 1); - assert_eq!(parsed.vectors[0].num_elements, 1); - assert_eq!(parsed.vectors[0].alp_info.num_exceptions, 1); - assert_eq!(parsed.vectors[0].for_info.bit_width, 0); - assert_eq!(parsed.vectors[0].exception_positions, vec![0]); - assert_eq!(parsed.vectors[0].exception_values, vec![42.5_f64.to_bits()]); + assert_eq!(parsed.offsets, vec![4]); + let vector_start = parsed.offsets[0] as usize; + let vector_end = parsed.body.len(); + let parsed_vector = parse_vector_view::( + parsed.body.as_ref(), + vector_start, + vector_end, + parsed.header.vector_num_elements(0), + ) + .unwrap(); + assert_eq!(parsed_vector.num_elements, 1); + assert_eq!(parsed_vector.alp_info.num_exceptions, 1); + assert_eq!(parsed_vector.for_info.bit_width, 0); + assert_eq!(parsed_vector.exception_positions, vec![0]); + assert_eq!(parsed_vector.exception_values, vec![42.5_f64.to_bits()]); } #[test] From a9fa0c30e53fc80b07083ad3c056316c78ed1b51 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 16:25:18 -0500 Subject: [PATCH 10/22] get decode fast path --- parquet/src/encodings/decoding/alp.rs | 88 ++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 189a8aadde04..87b63a6962ce 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -667,7 +667,31 @@ fn decode_vector_values( } fn decode_page_values(layout: &AlpPageLayout) -> Result> { - let mut out = Vec::with_capacity(layout.header.num_elements_usize()); + let total = layout.header.num_elements_usize(); + let mut out = vec![Value::default(); total]; + let written = decode_page_values_into::(layout, &mut out)?; + debug_assert_eq!(written, total); + Ok(out) +} + +/// Decode a full ALP page into an existing output slice. +/// +/// This walks vectors using `layout.offsets` (C++-style offset-based decode flow), +/// reparsing each vector view from the page body and decoding it into `out`. +fn decode_page_values_into( + layout: &AlpPageLayout, + out: &mut [Value], +) -> Result { + let total = layout.header.num_elements_usize(); + if out.len() < total { + return Err(general_err!( + "Invalid ALP decode output: output length {} smaller than page values {}", + out.len(), + total + )); + } + + let mut output_offset = 0usize; for (vector_idx, vector_offset) in layout.offsets.iter().enumerate() { let vector_start = *vector_offset as usize; let vector_end = if vector_idx + 1 < layout.offsets.len() { @@ -682,19 +706,20 @@ fn decode_page_values(layout: &AlpPageLayout) -> Result( - layout.body.as_ref(), - &vector, - )?); + let vector_values = decode_vector_values::(layout.body.as_ref(), &vector)?; + let next_offset = output_offset + vector_values.len(); + out[output_offset..next_offset].copy_from_slice(&vector_values); + output_offset = next_offset; } - Ok(out) + Ok(output_offset) } /// Decoder for ALP-encoded floating-point pages (`f32`/`f64`). /// /// Current behavior: /// - `set_data` parses + validates page metadata and stores ALP layout state. -/// - `get` lazily decodes the full page once, then copies from the decoded buffer. +/// - `get` uses a fast path to decode directly to output when all values are requested. +/// - otherwise, `get` lazily decodes the full page once, then copies from decoded buffer. /// - `skip` advances the decoded cursor. pub(crate) struct AlpDecoder where @@ -725,6 +750,9 @@ where } } + /// Decode the stored page into `decoded_values` if it hasn't been decoded yet. + /// + /// Used by partial `get` / `skip` paths that need a stable decoded buffer. fn ensure_decoded(&mut self) -> Result<()> { if !self.needs_decode { return Ok(()); @@ -746,6 +774,9 @@ where T::T: AlpFloat, ::Exact: Send, { + /// Store validated page layout and reset read cursor state. + /// + /// Actual value decoding is deferred until first `get`/`skip`. fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { let layout = parse_alp_page_layout::<::Exact>(data)?; @@ -765,12 +796,32 @@ where Ok(()) } + /// Read up to `buffer.len()` decoded values. + /// + /// Fast path: if caller requests all remaining values from a fresh page, + /// decode directly into `buffer` (matching C++ ALP decoder behavior). + /// Otherwise decode once into internal storage and copy slices from there. fn get(&mut self, buffer: &mut [T::T]) -> Result { let target = buffer.len().min(self.num_values); if target == 0 { return Ok(0); } + // C++ parity fast path: decode directly into caller output when it asks + // for all remaining values from a fresh page. + if self.needs_decode && target == self.num_values { + let layout = self.layout.take().ok_or_else(|| { + general_err!("Invalid ALP decoder state: set_data must be called before get/skip") + })?; + let written = decode_page_values_into::(&layout, &mut buffer[..target])?; + self.needs_decode = false; + self.decoded_values.clear(); + self.current_offset = 0; + self.num_values = 0; + debug_assert_eq!(written, target); + return Ok(written); + } + self.ensure_decoded()?; let end = self.current_offset + target; buffer[..target].copy_from_slice(&self.decoded_values[self.current_offset..end]); @@ -787,6 +838,10 @@ where Encoding::ALP } + /// Skip up to `num_values` decoded values. + /// + /// For parity with C++ partial-read behavior, this may trigger deferred + /// page decoding before advancing the cursor. fn skip(&mut self, num_values: usize) -> Result { let to_skip = num_values.min(self.num_values); if to_skip == 0 { @@ -1176,4 +1231,23 @@ mod tests { assert_eq!(out[0], 21.0); assert_eq!(decoder.values_left(), 0); } + + #[test] + fn test_alp_decoder_get_fast_path_full_read() { + let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); + let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); + let page = make_page_from_vectors(1, 4, &[vector0, vector1]); + + let mut decoder = AlpDecoder::::new(); + decoder.set_data(Bytes::from(page), 4).unwrap(); + + let mut out = [0.0f32; 4]; + let read = decoder.get(&mut out).unwrap(); + assert_eq!(read, 4); + assert_eq!(out, [10.0, 11.0, 20.0, 21.0]); + assert_eq!(decoder.values_left(), 0); + assert!(!decoder.needs_decode); + assert!(decoder.decoded_values.is_empty()); + assert!(decoder.layout.is_none()); + } } From 7754ebca92c46eab5fa2217c156fdf8386181f3d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 17:01:57 -0500 Subject: [PATCH 11/22] Simplify decode_page_values --- parquet/src/encodings/decoding/alp.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 87b63a6962ce..be756354941a 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -669,8 +669,7 @@ fn decode_vector_values( fn decode_page_values(layout: &AlpPageLayout) -> Result> { let total = layout.header.num_elements_usize(); let mut out = vec![Value::default(); total]; - let written = decode_page_values_into::(layout, &mut out)?; - debug_assert_eq!(written, total); + decode_page_values_into::(layout, &mut out)?; Ok(out) } @@ -681,7 +680,7 @@ fn decode_page_values(layout: &AlpPageLayout) -> Result( layout: &AlpPageLayout, out: &mut [Value], -) -> Result { +) -> Result<()> { let total = layout.header.num_elements_usize(); if out.len() < total { return Err(general_err!( @@ -711,7 +710,14 @@ fn decode_page_values_into( out[output_offset..next_offset].copy_from_slice(&vector_values); output_offset = next_offset; } - Ok(output_offset) + if output_offset != total { + return Err(general_err!( + "Invalid ALP decode output: decoded {} values, expected {}", + output_offset, + total + )); + } + Ok(()) } /// Decoder for ALP-encoded floating-point pages (`f32`/`f64`). @@ -813,13 +819,12 @@ where let layout = self.layout.take().ok_or_else(|| { general_err!("Invalid ALP decoder state: set_data must be called before get/skip") })?; - let written = decode_page_values_into::(&layout, &mut buffer[..target])?; + decode_page_values_into::(&layout, &mut buffer[..target])?; self.needs_decode = false; self.decoded_values.clear(); self.current_offset = 0; self.num_values = 0; - debug_assert_eq!(written, target); - return Ok(written); + return Ok(target); } self.ensure_decoded()?; From 6b7e923ede8b3e9742a5488a2a26aea9c5c32535 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 14:09:38 -0400 Subject: [PATCH 12/22] remove `version` from page header --- parquet/src/encodings/decoding/alp.rs | 53 ++++++++++----------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index be756354941a..6d8cdb4fdb8c 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -26,25 +26,22 @@ use crate::encodings::decoding::Decoder; use crate::errors::{ParquetError, Result}; use crate::util::bit_util::{BitReader, FromBytes}; -const ALP_HEADER_SIZE: usize = 8; -const ALP_VERSION: u8 = 1; +const ALP_HEADER_SIZE: usize = 7; const ALP_COMPRESSION_MODE: u8 = 0; const ALP_INTEGER_ENCODING_FOR_BIT_PACK: u8 = 0; const ALP_MAX_LOG_VECTOR_SIZE: u8 = 16; const ALP_MAX_EXPONENT_F32: u8 = 10; const ALP_MAX_EXPONENT_F64: u8 = 18; -/// Page-level ALP header (version 1, 8 bytes). +/// Page-level ALP header (7 bytes). /// /// Layout in bytes: -/// - `[0]` `version` -/// - `[1]` `compression_mode` -/// - `[2]` `integer_encoding` -/// - `[3]` `log_vector_size` -/// - `[4..8]` `num_elements` (little-endian `i32`) +/// - `[0]` `compression_mode` +/// - `[1]` `integer_encoding` +/// - `[2]` `log_vector_size` +/// - `[3..7]` `num_elements` (little-endian `i32`) #[derive(Debug, Clone, Copy)] struct AlpHeader { - version: u8, compression_mode: u8, integer_encoding: u8, log_vector_size: u8, @@ -316,7 +313,7 @@ impl AlpFloat for f64 { /// Parse and validate a full ALP-encoded page body. /// /// Validation includes: -/// - header fields/version/encoding +/// - header fields/encoding /// - non-negative `num_elements` /// - offsets bounds + monotonicity /// - per-vector metadata/data section lengths @@ -331,21 +328,12 @@ fn parse_alp_page_layout(data: Bytes) -> Result } let header = AlpHeader { - version: data_ref[0], - compression_mode: data_ref[1], - integer_encoding: data_ref[2], - log_vector_size: data_ref[3], - num_elements: i32::from_le_bytes([data_ref[4], data_ref[5], data_ref[6], data_ref[7]]), + compression_mode: data_ref[0], + integer_encoding: data_ref[1], + log_vector_size: data_ref[2], + num_elements: i32::from_le_bytes([data_ref[3], data_ref[4], data_ref[5], data_ref[6]]), }; - if header.version != ALP_VERSION { - return Err(general_err!( - "Invalid ALP page: unsupported version {}, expected {}", - header.version, - ALP_VERSION - )); - } - if header.compression_mode != ALP_COMPRESSION_MODE { return Err(general_err!( "Invalid ALP page: unsupported compression mode {}", @@ -866,7 +854,6 @@ mod tests { use crate::data_type::FloatType; fn make_alp_page_bytes( - version: u8, compression_mode: u8, integer_encoding: u8, log_vector_size: u8, @@ -875,7 +862,6 @@ mod tests { body_tail_len: usize, ) -> Vec { let mut out = Vec::with_capacity(ALP_HEADER_SIZE + offsets.len() * 4 + body_tail_len); - out.push(version); out.push(compression_mode); out.push(integer_encoding); out.push(log_vector_size); @@ -966,7 +952,7 @@ mod tests { running_offset += vector.len() as u32; } - let mut page = make_alp_page_bytes(1, 0, 0, log_vector_size, num_elements, &offsets, 0); + let mut page = make_alp_page_bytes(0, 0, log_vector_size, num_elements, &offsets, 0); for vector in vectors { page.extend_from_slice(vector); } @@ -975,9 +961,8 @@ mod tests { #[test] fn test_parse_alp_page_layout_valid() { - let data = make_alp_page_bytes(1, 0, 0, 2, 4, &[4], 13); + let data = make_alp_page_bytes(0, 0, 2, 4, &[4], 13); let parsed = parse_alp_page_layout::(Bytes::from(data)).unwrap(); - assert_eq!(parsed.header.version, 1); assert_eq!(parsed.header.num_elements, 4); assert_eq!(parsed.offsets, vec![4]); } @@ -987,13 +972,13 @@ mod tests { let err = parse_alp_page_layout::(Bytes::from_static(&[0, 1, 2])).unwrap_err(); assert!( err.to_string() - .contains("Invalid ALP page: expected at least 8 bytes for header") + .contains("Invalid ALP page: expected at least 7 bytes for header") ); } #[test] fn test_parse_alp_page_layout_invalid_log_vector_size() { - let data = make_alp_page_bytes(1, 0, 0, 17, 1, &[4], 8); + let data = make_alp_page_bytes(0, 0, 17, 1, &[4], 8); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -1003,7 +988,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_integer_encoding() { - let data = make_alp_page_bytes(1, 0, 1, 2, 1, &[4], 8); + let data = make_alp_page_bytes(0, 1, 2, 1, &[4], 8); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -1013,7 +998,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_negative_num_elements() { - let data = make_alp_page_bytes(1, 0, 0, 2, -1, &[4], 8); + let data = make_alp_page_bytes(0, 0, 2, -1, &[4], 8); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -1079,7 +1064,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_non_monotonic_offsets() { - let data = make_alp_page_bytes(1, 0, 0, 1, 3, &[12, 8], 12); + let data = make_alp_page_bytes(0, 0, 1, 3, &[12, 8], 12); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -1118,7 +1103,7 @@ mod tests { vector.extend_from_slice(&42.5_f64.to_le_bytes()); let offsets = [4u32]; - let mut page = make_alp_page_bytes(1, 0, 0, 0, 1, &offsets, 0); + let mut page = make_alp_page_bytes(0, 0, 0, 1, &offsets, 0); page.extend_from_slice(&vector); let parsed = parse_alp_page_layout::(Bytes::from(page)).unwrap(); From a13f24425646025d2e6f86714d106b00108f6111 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 14:34:56 -0400 Subject: [PATCH 13/22] change `log_vetctor_size` range consts --- parquet/src/encodings/decoding/alp.rs | 108 ++++++++++++++++---------- 1 file changed, 69 insertions(+), 39 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 6d8cdb4fdb8c..94b53ca03474 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -29,7 +29,8 @@ use crate::util::bit_util::{BitReader, FromBytes}; const ALP_HEADER_SIZE: usize = 7; const ALP_COMPRESSION_MODE: u8 = 0; const ALP_INTEGER_ENCODING_FOR_BIT_PACK: u8 = 0; -const ALP_MAX_LOG_VECTOR_SIZE: u8 = 16; +const ALP_MIN_LOG_VECTOR_SIZE: u8 = 3; +const ALP_MAX_LOG_VECTOR_SIZE: u8 = 15; const ALP_MAX_EXPONENT_F32: u8 = 10; const ALP_MAX_EXPONENT_F64: u8 = 18; @@ -348,6 +349,14 @@ fn parse_alp_page_layout(data: Bytes) -> Result )); } + if header.log_vector_size < ALP_MIN_LOG_VECTOR_SIZE { + return Err(general_err!( + "Invalid ALP page: log_vector_size {} below min {}", + header.log_vector_size, + ALP_MIN_LOG_VECTOR_SIZE + )); + } + if header.log_vector_size > ALP_MAX_LOG_VECTOR_SIZE { return Err(general_err!( "Invalid ALP page: log_vector_size {} exceeds max {}", @@ -961,7 +970,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_valid() { - let data = make_alp_page_bytes(0, 0, 2, 4, &[4], 13); + let data = make_alp_page_bytes(0, 0, 3, 4, &[4], 13); let parsed = parse_alp_page_layout::(Bytes::from(data)).unwrap(); assert_eq!(parsed.header.num_elements, 4); assert_eq!(parsed.offsets, vec![4]); @@ -977,18 +986,28 @@ mod tests { } #[test] - fn test_parse_alp_page_layout_invalid_log_vector_size() { - let data = make_alp_page_bytes(0, 0, 17, 1, &[4], 8); + fn test_parse_alp_page_layout_too_small_log_vector_size() { + let data = make_alp_page_bytes(0, 0, 2, 1, &[4], 8); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() - .contains("Invalid ALP page: log_vector_size 17 exceeds max 16") + .contains("Invalid ALP page: log_vector_size 2 below min 3") + ); + } + + #[test] + fn test_parse_alp_page_layout_too_big_log_vector_size() { + let data = make_alp_page_bytes(0, 0, 16, 1, &[4], 8); + let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: log_vector_size 16 exceeds max 15") ); } #[test] fn test_parse_alp_page_layout_invalid_integer_encoding() { - let data = make_alp_page_bytes(0, 1, 2, 1, &[4], 8); + let data = make_alp_page_bytes(0, 1, 3, 1, &[4], 8); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -998,7 +1017,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_negative_num_elements() { - let data = make_alp_page_bytes(0, 0, 2, -1, &[4], 8); + let data = make_alp_page_bytes(0, 0, 3, -1, &[4], 8); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -1009,7 +1028,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_exponent_f32() { let vector = make_vector_u32(11, 0, 0, 0, 0, &[], &[], &[]); - let page = make_page_from_vectors(0, 1, &[vector]); + let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( err.to_string() @@ -1020,7 +1039,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_factor_f32() { let vector = make_vector_u32(0, 11, 0, 0, 0, &[], &[], &[]); - let page = make_page_from_vectors(0, 1, &[vector]); + let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( err.to_string() @@ -1031,7 +1050,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_factor_exceeds_exponent() { let vector = make_vector_u32(2, 3, 0, 0, 0, &[], &[], &[]); - let page = make_page_from_vectors(0, 1, &[vector]); + let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( err.to_string() @@ -1042,7 +1061,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_num_exceptions() { let vector = make_vector_u32(0, 0, 2, 0, 0, &[], &[0, 0], &[0, 0]); - let page = make_page_from_vectors(0, 1, &[vector]); + let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( err.to_string() @@ -1053,7 +1072,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_exception_position() { let vector = make_vector_u32(0, 0, 1, 0, 0, &[], &[1], &[123]); - let page = make_page_from_vectors(0, 1, &[vector]); + let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( err.to_string().contains( @@ -1064,7 +1083,7 @@ mod tests { #[test] fn test_parse_alp_page_layout_non_monotonic_offsets() { - let data = make_alp_page_bytes(0, 0, 1, 3, &[12, 8], 12); + let data = make_alp_page_bytes(0, 0, 3, 9, &[12, 8], 12); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() @@ -1080,7 +1099,7 @@ mod tests { vector.extend_from_slice(&0u16.to_le_bytes()); vector.extend_from_slice(&10u32.to_le_bytes()); vector.push(1); - let page = make_page_from_vectors(1, 2, &[vector]); + let page = make_page_from_vectors(3, 2, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( err.to_string() @@ -1103,7 +1122,7 @@ mod tests { vector.extend_from_slice(&42.5_f64.to_le_bytes()); let offsets = [4u32]; - let mut page = make_alp_page_bytes(0, 0, 0, 1, &offsets, 0); + let mut page = make_alp_page_bytes(0, 0, 3, 1, &offsets, 0); page.extend_from_slice(&vector); let parsed = parse_alp_page_layout::(Bytes::from(page)).unwrap(); @@ -1146,7 +1165,7 @@ mod tests { #[test] fn test_decode_page_values_f32_no_exceptions() { let vector = make_vector_u32(0, 0, 0, 10, 2, &[0b1110_0100], &[], &[]); - let page = make_page_from_vectors(2, 4, &[vector]); + let page = make_page_from_vectors(3, 4, &[vector]); let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); let decoded = decode_page_values::(&layout).unwrap(); assert_eq!(decoded, vec![10.0, 11.0, 12.0, 13.0]); @@ -1156,10 +1175,13 @@ mod tests { fn test_decode_page_values_f64_multi_vector_with_exceptions() { let vector0 = make_vector_u64(0, 0, 1, 10, 1, &[0b0000_0010], &[1], &[42.5f64.to_bits()]); let vector1 = make_vector_u64(0, 0, 0, 7, 0, &[], &[], &[]); - let page = make_page_from_vectors(1, 3, &[vector0, vector1]); + let page = make_page_from_vectors(3, 9, &[vector0, vector1]); let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); let decoded = decode_page_values::(&layout).unwrap(); - assert_eq!(decoded, vec![10.0, 42.5, 7.0]); + assert_eq!( + decoded, + vec![10.0, 42.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 7.0] + ); } #[test] @@ -1170,7 +1192,7 @@ mod tests { f32::INFINITY.to_bits(), ]; let vector = make_vector_u32(0, 0, 3, 0, 0, &[], &[0, 1, 2], &edge_values); - let page = make_page_from_vectors(2, 3, &[vector]); + let page = make_page_from_vectors(3, 3, &[vector]); let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); let decoded = decode_page_values::(&layout).unwrap(); @@ -1184,21 +1206,24 @@ mod tests { fn test_alp_decoder_get_across_vectors() { let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); - let page = make_page_from_vectors(1, 4, &[vector0, vector1]); + let page = make_page_from_vectors(3, 12, &[vector0, vector1]); let mut decoder = AlpDecoder::::new(); - decoder.set_data(Bytes::from(page), 4).unwrap(); + decoder.set_data(Bytes::from(page), 12).unwrap(); - let mut first = [0.0f32; 3]; + let mut first = [0.0f32; 9]; let read = decoder.get(&mut first).unwrap(); - assert_eq!(read, 3); - assert_eq!(first, [10.0, 11.0, 20.0]); - assert_eq!(decoder.values_left(), 1); + assert_eq!(read, 9); + assert_eq!( + first, + [10.0, 11.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 20.0] + ); + assert_eq!(decoder.values_left(), 3); - let mut second = [0.0f32; 2]; + let mut second = [0.0f32; 3]; let read = decoder.get(&mut second).unwrap(); - assert_eq!(read, 1); - assert_eq!(second[0], 21.0); + assert_eq!(read, 3); + assert_eq!(second, [21.0, 20.0, 20.0]); assert_eq!(decoder.values_left(), 0); } @@ -1206,35 +1231,40 @@ mod tests { fn test_alp_decoder_skip_across_vectors() { let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); - let page = make_page_from_vectors(1, 4, &[vector0, vector1]); + let page = make_page_from_vectors(3, 12, &[vector0, vector1]); let mut decoder = AlpDecoder::::new(); - decoder.set_data(Bytes::from(page), 4).unwrap(); + decoder.set_data(Bytes::from(page), 12).unwrap(); - let skipped = decoder.skip(3).unwrap(); - assert_eq!(skipped, 3); - assert_eq!(decoder.values_left(), 1); + let skipped = decoder.skip(9).unwrap(); + assert_eq!(skipped, 9); + assert_eq!(decoder.values_left(), 3); let mut out = [0.0f32; 1]; let read = decoder.get(&mut out).unwrap(); assert_eq!(read, 1); assert_eq!(out[0], 21.0); - assert_eq!(decoder.values_left(), 0); + assert_eq!(decoder.values_left(), 2); } #[test] fn test_alp_decoder_get_fast_path_full_read() { let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); - let page = make_page_from_vectors(1, 4, &[vector0, vector1]); + let page = make_page_from_vectors(3, 12, &[vector0, vector1]); let mut decoder = AlpDecoder::::new(); - decoder.set_data(Bytes::from(page), 4).unwrap(); + decoder.set_data(Bytes::from(page), 12).unwrap(); - let mut out = [0.0f32; 4]; + let mut out = [0.0f32; 12]; let read = decoder.get(&mut out).unwrap(); - assert_eq!(read, 4); - assert_eq!(out, [10.0, 11.0, 20.0, 21.0]); + assert_eq!(read, 12); + assert_eq!( + out, + [ + 10.0, 11.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 20.0, 21.0, 20.0, 20.0 + ] + ); assert_eq!(decoder.values_left(), 0); assert!(!decoder.needs_decode); assert!(decoder.decoded_values.is_empty()); From 6e86d6f2574b49ee25d0a84e8d0a15b32bed9e47 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 16:08:26 -0400 Subject: [PATCH 14/22] redo the integration test using a test file from the c++ pr --- parquet-testing | 2 +- parquet/tests/arrow_reader/alp.rs | 129 +++++++++++++++++++++++++----- 2 files changed, 108 insertions(+), 23 deletions(-) diff --git a/parquet-testing b/parquet-testing index abd2478aea41..b49107d5e742 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit abd2478aea41d6cb10352b2277a7dc67ae20a02a +Subproject commit b49107d5e7426ccd657965dbf85fcac430812d64 diff --git a/parquet/tests/arrow_reader/alp.rs b/parquet/tests/arrow_reader/alp.rs index a74f32693539..93f36c1aaffb 100644 --- a/parquet/tests/arrow_reader/alp.rs +++ b/parquet/tests/arrow_reader/alp.rs @@ -15,38 +15,123 @@ // specific language governing permissions and limitations // under the License. +use arrow::compute::concat_batches; use arrow::util::test_util::parquet_test_data; use arrow_array::cast::as_primitive_array; -use arrow_array::types::Float64Type; +use arrow_array::types::Float32Type; +use arrow_array::{Array, ArrayRef, Float32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::PathBuf; +use std::sync::Arc; #[test] -fn test_read_single_f64_alp() { - let path = std::path::PathBuf::from(parquet_test_data()).join("single_f64_ALP.parquet"); - if !path.exists() { - eprintln!("Skipping ALP test file not found: {}", path.display()); +fn test_read_f32_alp() { + let data_dir = PathBuf::from(parquet_test_data()); + let parquet_path = data_dir.join("alp_float_arade.parquet"); + let expected_csv_path = data_dir.join("alp_arade_expect.csv"); + if !parquet_path.exists() || !expected_csv_path.exists() { + eprintln!("Skipping ALP test files not found"); return; } - let file = std::fs::File::open(path).unwrap(); - let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); - - let mut total_rows = 0usize; - let mut first_value = None; - loop { - match reader.next() { - Some(Ok(batch)) => { - total_rows += batch.num_rows(); - if first_value.is_none() && batch.num_rows() > 0 { - let values = as_primitive_array::(batch.column(0).as_ref()); - first_value = Some(values.value(0)); - } + let expected = read_expected_csv_batch(&expected_csv_path); + let actual = read_parquet_batch(&parquet_path); + + assert_eq!(actual.schema(), expected.schema(), "schema mismatch"); + assert_eq!( + actual.num_columns(), + expected.num_columns(), + "column mismatch" + ); + assert_eq!(actual.num_rows(), expected.num_rows(), "row count mismatch"); + + for col_idx in 0..actual.num_columns() { + let col_name = actual.schema().field(col_idx).name().clone(); + let actual_col = as_primitive_array::(actual.column(col_idx).as_ref()); + let expected_col = as_primitive_array::(expected.column(col_idx).as_ref()); + + for row_idx in 0..actual.num_rows() { + assert_eq!( + actual_col.is_valid(row_idx), + expected_col.is_valid(row_idx), + "null mismatch at column {col_name} row {row_idx}" + ); + if actual_col.is_valid(row_idx) { + let actual_value = actual_col.value(row_idx); + let expected_value = expected_col.value(row_idx); + assert!( + actual_value.to_bits() == expected_value.to_bits(), + "bit mismatch at column {col_name} row {row_idx}: expected={expected_value} actual={actual_value}" + ); } - Some(Err(err)) => panic!("Unexpected ALP decode error: {err}"), - None => break, } } +} + +fn alp_schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("value1", DataType::Float32, true), + Field::new("value2", DataType::Float32, true), + Field::new("value3", DataType::Float32, true), + Field::new("value4", DataType::Float32, true), + ])) +} + +fn read_parquet_batch(path: &PathBuf) -> RecordBatch { + let file = File::open(path).unwrap(); + let reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); + let mut batches = Vec::new(); + for batch in reader { + batches.push(batch.unwrap()); + } + assert!(!batches.is_empty(), "expected non-empty parquet batch set"); + concat_batches(batches[0].schema_ref(), &batches).unwrap() +} + +fn read_expected_csv_batch(path: &PathBuf) -> RecordBatch { + let file = File::open(path).unwrap(); + let mut lines = BufReader::new(file).lines(); + + let header = lines.next().expect("expected csv header line").unwrap(); + assert_eq!( + header.trim(), + "value1,value2,value3,value4", + "unexpected csv header" + ); + + let mut c0 = Vec::new(); + let mut c1 = Vec::new(); + let mut c2 = Vec::new(); + let mut c3 = Vec::new(); + for (line_idx, line) in lines.enumerate() { + let line = line.unwrap(); + if line.trim().is_empty() { + continue; + } + let values: Vec<_> = line.split(',').map(str::trim).collect(); + assert_eq!( + values.len(), + 4, + "wrong csv column count at line {}", + line_idx + 2 + ); + c0.push(values[0].parse::().unwrap()); + c1.push(values[1].parse::().unwrap()); + c2.push(values[2].parse::().unwrap()); + c3.push(values[3].parse::().unwrap()); + } - assert_eq!(total_rows, 1024); - assert_eq!(first_value, Some(0.125)); + RecordBatch::try_new( + alp_schema(), + vec![ + Arc::new(Float32Array::from(c0)) as ArrayRef, + Arc::new(Float32Array::from(c1)) as ArrayRef, + Arc::new(Float32Array::from(c2)) as ArrayRef, + Arc::new(Float32Array::from(c3)) as ArrayRef, + ], + ) + .unwrap() } From ac56cc28c4c5df2989f09dbe7ff936e576f7fcc0 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 16:23:16 -0400 Subject: [PATCH 15/22] cache parsed vectors --- parquet/src/encodings/decoding/alp.rs | 54 +++++++++------------------ 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 94b53ca03474..d69496688fda 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -133,10 +133,10 @@ struct AlpEncodedVectorView { /// Parsed ALP page layout for one exact integer width (`u32` for float pages, /// `u64` for double pages). #[derive(Debug)] -struct AlpPageLayout { +struct AlpPageLayout { header: AlpHeader, body: Bytes, - offsets: Vec, + vectors: Vec>, } /// Exact integer type used by FOR reconstruction. @@ -318,7 +318,7 @@ impl AlpFloat for f64 { /// - non-negative `num_elements` /// - offsets bounds + monotonicity /// - per-vector metadata/data section lengths -fn parse_alp_page_layout(data: Bytes) -> Result { +fn parse_alp_page_layout(data: Bytes) -> Result> { let data_ref = data.as_ref(); if data_ref.len() < ALP_HEADER_SIZE { return Err(general_err!( @@ -424,6 +424,7 @@ fn parse_alp_page_layout(data: Bytes) -> Result offsets.push(offset); } + let mut vectors = Vec::with_capacity(num_vectors); for (vector_idx, vector_offset) in offsets.iter().enumerate() { let vector_start = *vector_offset as usize; let vector_end = if vector_idx + 1 < offsets.len() { @@ -440,13 +441,15 @@ fn parse_alp_page_layout(data: Bytes) -> Result } let vector_num_elements = header.vector_num_elements(vector_idx); - parse_vector_view::(body_ref, vector_start, vector_end, vector_num_elements)?; + let vector = + parse_vector_view::(body_ref, vector_start, vector_end, vector_num_elements)?; + vectors.push(vector); } Ok(AlpPageLayout { header, body, - offsets, + vectors, }) } @@ -663,7 +666,7 @@ fn decode_vector_values( Ok(out) } -fn decode_page_values(layout: &AlpPageLayout) -> Result> { +fn decode_page_values(layout: &AlpPageLayout) -> Result> { let total = layout.header.num_elements_usize(); let mut out = vec![Value::default(); total]; decode_page_values_into::(layout, &mut out)?; @@ -672,10 +675,9 @@ fn decode_page_values(layout: &AlpPageLayout) -> Result( - layout: &AlpPageLayout, + layout: &AlpPageLayout, out: &mut [Value], ) -> Result<()> { let total = layout.header.num_elements_usize(); @@ -688,21 +690,8 @@ fn decode_page_values_into( } let mut output_offset = 0usize; - for (vector_idx, vector_offset) in layout.offsets.iter().enumerate() { - let vector_start = *vector_offset as usize; - let vector_end = if vector_idx + 1 < layout.offsets.len() { - layout.offsets[vector_idx + 1] as usize - } else { - layout.body.len() - }; - let vector_num_elements = layout.header.vector_num_elements(vector_idx); - let vector = parse_vector_view::( - layout.body.as_ref(), - vector_start, - vector_end, - vector_num_elements, - )?; - let vector_values = decode_vector_values::(layout.body.as_ref(), &vector)?; + for vector in &layout.vectors { + let vector_values = decode_vector_values::(layout.body.as_ref(), vector)?; let next_offset = output_offset + vector_values.len(); out[output_offset..next_offset].copy_from_slice(&vector_values); output_offset = next_offset; @@ -729,7 +718,7 @@ where T::T: AlpFloat, ::Exact: Send, { - layout: Option, + layout: Option::Exact>>, decoded_values: Vec, current_offset: usize, needs_decode: bool, @@ -973,7 +962,8 @@ mod tests { let data = make_alp_page_bytes(0, 0, 3, 4, &[4], 13); let parsed = parse_alp_page_layout::(Bytes::from(data)).unwrap(); assert_eq!(parsed.header.num_elements, 4); - assert_eq!(parsed.offsets, vec![4]); + assert_eq!(parsed.vectors.len(), 1); + assert_eq!(parsed.vectors[0].num_elements, 4); } #[test] @@ -1126,16 +1116,8 @@ mod tests { page.extend_from_slice(&vector); let parsed = parse_alp_page_layout::(Bytes::from(page)).unwrap(); - assert_eq!(parsed.offsets, vec![4]); - let vector_start = parsed.offsets[0] as usize; - let vector_end = parsed.body.len(); - let parsed_vector = parse_vector_view::( - parsed.body.as_ref(), - vector_start, - vector_end, - parsed.header.vector_num_elements(0), - ) - .unwrap(); + assert_eq!(parsed.vectors.len(), 1); + let parsed_vector = &parsed.vectors[0]; assert_eq!(parsed_vector.num_elements, 1); assert_eq!(parsed_vector.alp_info.num_exceptions, 1); assert_eq!(parsed_vector.for_info.bit_width, 0); From 07ca6f06471c95a5198e4bf3be06d47fa2b88a4a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 16:29:58 -0400 Subject: [PATCH 16/22] use `Bytes` for `packed_values` --- parquet/src/encodings/decoding/alp.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index d69496688fda..c56c393696bf 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -581,7 +581,7 @@ fn parse_vector_view( /// Decode bit-packed deltas into exact integers. fn bit_unpack_integers( - packed_values: &[u8], + packed_values: Bytes, bit_width: u8, num_elements: u16, ) -> Result> { @@ -598,7 +598,7 @@ fn bit_unpack_integers( } let mut out = vec![Exact::zero(); num_elements as usize]; - let mut reader = BitReader::new(Bytes::copy_from_slice(packed_values)); + let mut reader = BitReader::new(packed_values); let read = reader.get_batch::(&mut out, bit_width as usize); if read != out.len() { return Err(general_err!( @@ -621,11 +621,11 @@ fn inverse_for(deltas: &mut [Exact], frame_of_reference: Exact) /// Decode one vector into output floating values: /// bit-unpack -> inverse FOR -> decimal decode -> patch exceptions. fn decode_vector_values( - body: &[u8], + body: &Bytes, vector: &AlpEncodedVectorView, ) -> Result> { let mut exact_values = bit_unpack_integers( - &body[vector.packed_values.clone()], + body.slice(vector.packed_values.clone()), vector.for_info.bit_width, vector.num_elements, )?; @@ -691,7 +691,7 @@ fn decode_page_values_into( let mut output_offset = 0usize; for vector in &layout.vectors { - let vector_values = decode_vector_values::(layout.body.as_ref(), vector)?; + let vector_values = decode_vector_values::(&layout.body, vector)?; let next_offset = output_offset + vector_values.len(); out[output_offset..next_offset].copy_from_slice(&vector_values); output_offset = next_offset; @@ -1127,13 +1127,13 @@ mod tests { #[test] fn test_bit_unpack_integers_width_zero() { - let unpacked = bit_unpack_integers::(&[], 0, 3).unwrap(); + let unpacked = bit_unpack_integers::(Bytes::new(), 0, 3).unwrap(); assert_eq!(unpacked, vec![0, 0, 0]); } #[test] fn test_bit_unpack_integers_width_two() { - let unpacked = bit_unpack_integers::(&[0b0010_0111], 2, 3).unwrap(); + let unpacked = bit_unpack_integers::(Bytes::from_static(&[0b0010_0111]), 2, 3).unwrap(); assert_eq!(unpacked, vec![3, 1, 2]); } From def5c095866d3d687708817e97333985bfd34dba Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 16:33:52 -0400 Subject: [PATCH 17/22] clippy too many args --- parquet/src/encodings/decoding/alp.rs | 223 +++++++++++++++++++------- 1 file changed, 166 insertions(+), 57 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index c56c393696bf..6740d34aed2a 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -871,60 +871,49 @@ mod tests { out } - fn make_vector_u32( - exponent: u8, - factor: u8, - num_exceptions: u16, - frame_of_reference: u32, - bit_width: u8, - packed_values: &[u8], - exception_positions: &[u16], - exception_values: &[u32], - ) -> Vec { - assert_eq!(num_exceptions as usize, exception_positions.len()); - assert_eq!(num_exceptions as usize, exception_values.len()); + trait AppendLeBytes { + fn append_le_bytes(self, out: &mut Vec); + } - let mut out = Vec::new(); - out.push(exponent); - out.push(factor); - out.extend_from_slice(&num_exceptions.to_le_bytes()); - out.extend_from_slice(&frame_of_reference.to_le_bytes()); - out.push(bit_width); - out.extend_from_slice(packed_values); - for position in exception_positions { - out.extend_from_slice(&position.to_le_bytes()); + impl AppendLeBytes for u32 { + fn append_le_bytes(self, out: &mut Vec) { + out.extend_from_slice(&self.to_le_bytes()); } - for value in exception_values { - out.extend_from_slice(&value.to_le_bytes()); + } + + impl AppendLeBytes for u64 { + fn append_le_bytes(self, out: &mut Vec) { + out.extend_from_slice(&self.to_le_bytes()); } - out } - fn make_vector_u64( + struct VectorSpec<'a, Exact> { exponent: u8, factor: u8, - num_exceptions: u16, - frame_of_reference: u64, + frame_of_reference: Exact, bit_width: u8, - packed_values: &[u8], - exception_positions: &[u16], - exception_values: &[u64], - ) -> Vec { - assert_eq!(num_exceptions as usize, exception_positions.len()); - assert_eq!(num_exceptions as usize, exception_values.len()); + packed_values: &'a [u8], + exception_positions: &'a [u16], + exception_values: &'a [Exact], + } + + fn make_vector(spec: VectorSpec<'_, Exact>) -> Vec { + let num_exceptions = spec.exception_positions.len(); + assert_eq!(num_exceptions, spec.exception_values.len()); + assert!(u16::try_from(num_exceptions).is_ok()); let mut out = Vec::new(); - out.push(exponent); - out.push(factor); - out.extend_from_slice(&num_exceptions.to_le_bytes()); - out.extend_from_slice(&frame_of_reference.to_le_bytes()); - out.push(bit_width); - out.extend_from_slice(packed_values); - for position in exception_positions { + out.push(spec.exponent); + out.push(spec.factor); + out.extend_from_slice(&(num_exceptions as u16).to_le_bytes()); + spec.frame_of_reference.append_le_bytes(&mut out); + out.push(spec.bit_width); + out.extend_from_slice(spec.packed_values); + for position in spec.exception_positions { out.extend_from_slice(&position.to_le_bytes()); } - for value in exception_values { - out.extend_from_slice(&value.to_le_bytes()); + for value in spec.exception_values { + value.append_le_bytes(&mut out); } out } @@ -1017,7 +1006,15 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_exponent_f32() { - let vector = make_vector_u32(11, 0, 0, 0, 0, &[], &[], &[]); + let vector = make_vector(VectorSpec { + exponent: 11, + factor: 0, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( @@ -1028,7 +1025,15 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_factor_f32() { - let vector = make_vector_u32(0, 11, 0, 0, 0, &[], &[], &[]); + let vector = make_vector(VectorSpec { + exponent: 0, + factor: 11, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( @@ -1039,7 +1044,15 @@ mod tests { #[test] fn test_parse_alp_page_layout_factor_exceeds_exponent() { - let vector = make_vector_u32(2, 3, 0, 0, 0, &[], &[], &[]); + let vector = make_vector(VectorSpec { + exponent: 2, + factor: 3, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( @@ -1050,7 +1063,15 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_num_exceptions() { - let vector = make_vector_u32(0, 0, 2, 0, 0, &[], &[0, 0], &[0, 0]); + let vector = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[0, 0], + exception_values: &[0, 0], + }); let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( @@ -1061,7 +1082,15 @@ mod tests { #[test] fn test_parse_alp_page_layout_invalid_exception_position() { - let vector = make_vector_u32(0, 0, 1, 0, 0, &[], &[1], &[123]); + let vector = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[1], + exception_values: &[123], + }); let page = make_page_from_vectors(3, 1, &[vector]); let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( @@ -1146,7 +1175,15 @@ mod tests { #[test] fn test_decode_page_values_f32_no_exceptions() { - let vector = make_vector_u32(0, 0, 0, 10, 2, &[0b1110_0100], &[], &[]); + let vector = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 10u32, + bit_width: 2, + packed_values: &[0b1110_0100], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 4, &[vector]); let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); let decoded = decode_page_values::(&layout).unwrap(); @@ -1155,8 +1192,24 @@ mod tests { #[test] fn test_decode_page_values_f64_multi_vector_with_exceptions() { - let vector0 = make_vector_u64(0, 0, 1, 10, 1, &[0b0000_0010], &[1], &[42.5f64.to_bits()]); - let vector1 = make_vector_u64(0, 0, 0, 7, 0, &[], &[], &[]); + let vector0 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 10u64, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[1], + exception_values: &[42.5f64.to_bits()], + }); + let vector1 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 7u64, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 9, &[vector0, vector1]); let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); let decoded = decode_page_values::(&layout).unwrap(); @@ -1173,7 +1226,15 @@ mod tests { (-0.0f32).to_bits(), f32::INFINITY.to_bits(), ]; - let vector = make_vector_u32(0, 0, 3, 0, 0, &[], &[0, 1, 2], &edge_values); + let vector = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[0, 1, 2], + exception_values: &edge_values, + }); let page = make_page_from_vectors(3, 3, &[vector]); let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); let decoded = decode_page_values::(&layout).unwrap(); @@ -1186,8 +1247,24 @@ mod tests { #[test] fn test_alp_decoder_get_across_vectors() { - let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); - let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); + let vector0 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 10u32, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[], + exception_values: &[], + }); + let vector1 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 20u32, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 12, &[vector0, vector1]); let mut decoder = AlpDecoder::::new(); @@ -1211,8 +1288,24 @@ mod tests { #[test] fn test_alp_decoder_skip_across_vectors() { - let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); - let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); + let vector0 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 10u32, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[], + exception_values: &[], + }); + let vector1 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 20u32, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 12, &[vector0, vector1]); let mut decoder = AlpDecoder::::new(); @@ -1231,8 +1324,24 @@ mod tests { #[test] fn test_alp_decoder_get_fast_path_full_read() { - let vector0 = make_vector_u32(0, 0, 0, 10, 1, &[0b0000_0010], &[], &[]); - let vector1 = make_vector_u32(0, 0, 0, 20, 1, &[0b0000_0010], &[], &[]); + let vector0 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 10u32, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[], + exception_values: &[], + }); + let vector1 = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 20u32, + bit_width: 1, + packed_values: &[0b0000_0010], + exception_positions: &[], + exception_values: &[], + }); let page = make_page_from_vectors(3, 12, &[vector0, vector1]); let mut decoder = AlpDecoder::::new(); From 5e5c8d242eb4b801cec96e60a6a543c5a54427b8 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 12 Mar 2026 19:25:11 -0400 Subject: [PATCH 18/22] cargo fmt --- parquet/src/encodings/decoding.rs | 2 +- parquet/src/encodings/decoding/alp.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 36c7f6720104..dce631930975 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -26,10 +26,10 @@ use super::rle::RleDecoder; use crate::basic::*; use crate::data_type::private::ParquetValueType; use crate::data_type::*; +use crate::encodings::decoding::alp::AlpDecoder; use crate::encodings::decoding::byte_stream_split_decoder::{ ByteStreamSplitDecoder, VariableWidthByteStreamSplitDecoder, }; -use crate::encodings::decoding::alp::AlpDecoder; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::{self, BitReader}; diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 6740d34aed2a..b5f507e341f9 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -1162,7 +1162,8 @@ mod tests { #[test] fn test_bit_unpack_integers_width_two() { - let unpacked = bit_unpack_integers::(Bytes::from_static(&[0b0010_0111]), 2, 3).unwrap(); + let unpacked = + bit_unpack_integers::(Bytes::from_static(&[0b0010_0111]), 2, 3).unwrap(); assert_eq!(unpacked, vec![3, 1, 2]); } From 5ed70ae96a4e2126dda1104450bca1cdd9932cd6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 16 Mar 2026 09:30:13 -0400 Subject: [PATCH 19/22] fix decode to two-step multiplication --- parquet/src/encodings/decoding/alp.rs | 134 +++++++++++++++++++------- 1 file changed, 101 insertions(+), 33 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index b5f507e341f9..f444d47611cb 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -202,26 +202,40 @@ impl AlpExact for u64 { } } -const ALP_I64_POW10: [i64; 19] = [ - 1, - 10, - 100, - 1_000, - 10_000, - 100_000, - 1_000_000, - 10_000_000, - 100_000_000, - 1_000_000_000, - 10_000_000_000, - 100_000_000_000, - 1_000_000_000_000, - 10_000_000_000_000, - 100_000_000_000_000, - 1_000_000_000_000_000, - 10_000_000_000_000_000, - 100_000_000_000_000_000, - 1_000_000_000_000_000_000, +const ALP_POW10_F32: [f32; 11] = [ + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, +]; + +const ALP_POW10_F64: [f64; 19] = [ + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, + 100000000000.0, + 1000000000000.0, + 10000000000000.0, + 100000000000000.0, + 1000000000000000.0, + 10000000000000000.0, + 100000000000000000.0, + 1000000000000000000.0, ]; const ALP_NEG_POW10_F32: [f32; 11] = [ @@ -262,30 +276,35 @@ const ALP_NEG_POW10_F64: [f64; 19] = [ pub(super) trait AlpFloat: Copy + Default { type Exact: AlpExact + FromBytes; + type Scale: Copy; - /// Precompute vector-level ALP decimal scale for: - /// `value = encoded * 10^(factor) * 10^(-exponent)`. + /// Precompute vector-level ALP decimal scale constants for: + /// `value = (encoded * 10^(factor)) * 10^(-exponent)`. /// /// Preconditions are validated during page parse. - fn decode_scale(exponent: u8, factor: u8) -> Self; + fn decode_scale(exponent: u8, factor: u8) -> Self::Scale; - /// Decode one signed exact integer using a precomputed scale. - fn decode_value(signed_encoded: ::Signed, scale: Self) -> Self; + /// Decode one signed exact integer using a precomputed two-step scale. + fn decode_value( + signed_encoded: ::Signed, + scale: Self::Scale, + ) -> Self; fn from_exact_bits(bits: Self::Exact) -> Self; } impl AlpFloat for f32 { type Exact = u32; + type Scale = (f32, f32); - fn decode_scale(exponent: u8, factor: u8) -> Self { + fn decode_scale(exponent: u8, factor: u8) -> Self::Scale { debug_assert!(exponent <= ALP_MAX_EXPONENT_F32); debug_assert!(factor <= exponent); - (ALP_I64_POW10[factor as usize] as f32) * ALP_NEG_POW10_F32[exponent as usize] + (ALP_POW10_F32[factor as usize], ALP_NEG_POW10_F32[exponent as usize]) } - fn decode_value(signed_encoded: i32, scale: Self) -> Self { - (signed_encoded as f32) * scale + fn decode_value(signed_encoded: i32, scale: Self::Scale) -> Self { + ((signed_encoded as f32) * scale.0) * scale.1 } fn from_exact_bits(bits: Self::Exact) -> Self { @@ -295,15 +314,16 @@ impl AlpFloat for f32 { impl AlpFloat for f64 { type Exact = u64; + type Scale = (f64, f64); - fn decode_scale(exponent: u8, factor: u8) -> Self { + fn decode_scale(exponent: u8, factor: u8) -> Self::Scale { debug_assert!(exponent <= ALP_MAX_EXPONENT_F64); debug_assert!(factor <= exponent); - (ALP_I64_POW10[factor as usize] as f64) * ALP_NEG_POW10_F64[exponent as usize] + (ALP_POW10_F64[factor as usize], ALP_NEG_POW10_F64[exponent as usize]) } - fn decode_value(signed_encoded: i64, scale: Self) -> Self { - (signed_encoded as f64) * scale + fn decode_value(signed_encoded: i64, scale: Self::Scale) -> Self { + ((signed_encoded as f64) * scale.0) * scale.1 } fn from_exact_bits(bits: Self::Exact) -> Self { @@ -1246,6 +1266,54 @@ mod tests { assert_eq!(decoded[2], f32::INFINITY); } + #[test] + fn test_decode_page_values_f32_two_step_decimal_multiply() { + let encoded = 1_970_570_984_i32; + let vector = make_vector(VectorSpec { + exponent: 1, + factor: 1, + frame_of_reference: encoded as u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); + let page = make_page_from_vectors(3, 1, &[vector]); + let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); + let decoded = decode_page_values::(&layout).unwrap(); + + let expected_two_step = ((encoded as f32) * ALP_POW10_F32[1]) * ALP_NEG_POW10_F32[1]; + let one_step_scale = ALP_POW10_F32[1] * ALP_NEG_POW10_F32[1]; + let expected_one_step = (encoded as f32) * one_step_scale; + + assert_eq!(decoded[0].to_bits(), expected_two_step.to_bits()); + assert_ne!(decoded[0].to_bits(), expected_one_step.to_bits()); + } + + #[test] + fn test_decode_page_values_f64_two_step_decimal_multiply() { + let encoded = -3_900_047_474_048_127_703_i64; + let vector = make_vector(VectorSpec { + exponent: 1, + factor: 1, + frame_of_reference: encoded as u64, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); + let page = make_page_from_vectors(3, 1, &[vector]); + let layout = parse_alp_page_layout::(Bytes::from(page)).unwrap(); + let decoded = decode_page_values::(&layout).unwrap(); + + let expected_two_step = ((encoded as f64) * ALP_POW10_F64[1]) * ALP_NEG_POW10_F64[1]; + let one_step_scale = ALP_POW10_F64[1] * ALP_NEG_POW10_F64[1]; + let expected_one_step = (encoded as f64) * one_step_scale; + + assert_eq!(decoded[0].to_bits(), expected_two_step.to_bits()); + assert_ne!(decoded[0].to_bits(), expected_one_step.to_bits()); + } + #[test] fn test_alp_decoder_get_across_vectors() { let vector0 = make_vector(VectorSpec { From e328d7a8e903e4bc5321b9d74ee9194a83e956c9 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 16 Mar 2026 09:47:33 -0400 Subject: [PATCH 20/22] make vector byte count more strict --- parquet/src/encodings/decoding/alp.rs | 86 ++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index f444d47611cb..b34840b29db7 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -130,6 +130,16 @@ struct AlpEncodedVectorView { exception_values: Vec, } +impl AlpEncodedVectorView { + fn expected_stored_size(&self) -> usize { + AlpEncodedVectorInfo::STORED_SIZE + + AlpEncodedForVectorInfo::::stored_size() + + self + .for_info + .get_data_stored_size(self.num_elements, self.alp_info.num_exceptions) + } +} + /// Parsed ALP page layout for one exact integer width (`u32` for float pages, /// `u64` for double pages). #[derive(Debug)] @@ -445,6 +455,7 @@ fn parse_alp_page_layout(data: Bytes) -> Result(data: Bytes) -> Result(body_ref, vector_start, vector_end, vector_num_elements)?; + expected_vectors_size = expected_vectors_size + .checked_add(vector.expected_stored_size()) + .ok_or_else(|| general_err!("Invalid ALP page: expected vectors size overflow"))?; vectors.push(vector); } + let expected_body_len = offsets_section_size + .checked_add(expected_vectors_size) + .ok_or_else(|| general_err!("Invalid ALP page: expected body size overflow"))?; + if body_len != expected_body_len { + return Err(general_err!( + "Invalid ALP page: body size {} does not match expected {} (offsets + vectors)", + body_len, + expected_body_len + )); + } + Ok(AlpPageLayout { header, body, @@ -548,15 +573,23 @@ fn parse_vector_view( }; let data_size = for_info.get_data_stored_size(num_elements, alp_info.num_exceptions); - if vector_bytes.len() < metadata_size + data_size { + let expected_size = metadata_size + data_size; + if vector_bytes.len() < expected_size { return Err(general_err!( "Invalid ALP page: vector data too short, expected at least {} bytes, got {}", - metadata_size + data_size, + expected_size, + vector_bytes.len() + )); + } + if vector_bytes.len() > expected_size { + return Err(general_err!( + "Invalid ALP page: vector data too long, expected {} bytes, got {}", + expected_size, vector_bytes.len() )); } - let data = &vector_bytes[metadata_size..metadata_size + data_size]; + let data = &vector_bytes[metadata_size..expected_size]; let packed_size = for_info.get_bit_packed_size(num_elements); let positions_size = alp_info.num_exceptions as usize * std::mem::size_of::(); let values_size = alp_info.num_exceptions as usize * Exact::WIDTH; @@ -1146,6 +1179,53 @@ mod tests { ); } + #[test] + fn test_parse_alp_page_layout_vector_data_too_long() { + let mut vector = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); + vector.push(0xAB); + + let page = make_page_from_vectors(3, 1, &[vector]); + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string() + .contains("Invalid ALP page: vector data too long, expected 9 bytes, got 10") + ); + } + + #[test] + fn test_parse_alp_page_layout_body_size_mismatch_unclaimed_bytes() { + let vector = make_vector(VectorSpec { + exponent: 0, + factor: 0, + frame_of_reference: 0u32, + bit_width: 0, + packed_values: &[], + exception_positions: &[], + exception_values: &[], + }); + + // offsets section is 4 bytes for one vector, so offset=5 leaves one + // unclaimed byte between offsets and vector data. + let mut page = make_alp_page_bytes(0, 0, 3, 1, &[5], 0); + page.push(0); + page.extend_from_slice(&vector); + + let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); + assert!( + err.to_string().contains( + "Invalid ALP page: body size 14 does not match expected 13 (offsets + vectors)" + ) + ); + } + #[test] fn test_parse_alp_page_layout_parses_vector_view_data_only_f64() { let mut vector = Vec::new(); From 1807d32ea0627cde358af742cd07436e830683aa Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 16 Mar 2026 09:56:14 -0400 Subject: [PATCH 21/22] add stricter offset validation --- parquet/src/encodings/decoding/alp.rs | 29 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index b34840b29db7..5c1332629a3b 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -455,9 +455,18 @@ fn parse_alp_page_layout(data: Bytes) -> Result(data: Bytes) -> Result(body_ref, vector_start, vector_end, vector_num_elements)?; - expected_vectors_size = expected_vectors_size + expected_next_offset = vector_start .checked_add(vector.expected_stored_size()) - .ok_or_else(|| general_err!("Invalid ALP page: expected vectors size overflow"))?; + .ok_or_else(|| general_err!("Invalid ALP page: expected next vector offset overflow"))?; vectors.push(vector); } - let expected_body_len = offsets_section_size - .checked_add(expected_vectors_size) - .ok_or_else(|| general_err!("Invalid ALP page: expected body size overflow"))?; - if body_len != expected_body_len { + if expected_next_offset != body_len { return Err(general_err!( "Invalid ALP page: body size {} does not match expected {} (offsets + vectors)", body_len, - expected_body_len + expected_next_offset )); } @@ -1159,7 +1165,7 @@ mod tests { let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( err.to_string() - .contains("Invalid ALP page: vector offsets are not monotonic at index 0") + .contains("Invalid ALP page: vector offset 12 at index 0 does not match expected 8") ); } @@ -1220,9 +1226,8 @@ mod tests { let err = parse_alp_page_layout::(Bytes::from(page)).unwrap_err(); assert!( - err.to_string().contains( - "Invalid ALP page: body size 14 does not match expected 13 (offsets + vectors)" - ) + err.to_string() + .contains("Invalid ALP page: vector offset 5 at index 0 does not match expected 4") ); } From 11c79057a5c69e9235fb8769578fa71c95c1ea5e Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 16 Mar 2026 10:18:25 -0400 Subject: [PATCH 22/22] fmt --- parquet/src/encodings/decoding/alp.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/parquet/src/encodings/decoding/alp.rs b/parquet/src/encodings/decoding/alp.rs index 5c1332629a3b..eb76a27536ce 100644 --- a/parquet/src/encodings/decoding/alp.rs +++ b/parquet/src/encodings/decoding/alp.rs @@ -295,10 +295,7 @@ pub(super) trait AlpFloat: Copy + Default { fn decode_scale(exponent: u8, factor: u8) -> Self::Scale; /// Decode one signed exact integer using a precomputed two-step scale. - fn decode_value( - signed_encoded: ::Signed, - scale: Self::Scale, - ) -> Self; + fn decode_value(signed_encoded: ::Signed, scale: Self::Scale) -> Self; fn from_exact_bits(bits: Self::Exact) -> Self; } @@ -310,7 +307,10 @@ impl AlpFloat for f32 { fn decode_scale(exponent: u8, factor: u8) -> Self::Scale { debug_assert!(exponent <= ALP_MAX_EXPONENT_F32); debug_assert!(factor <= exponent); - (ALP_POW10_F32[factor as usize], ALP_NEG_POW10_F32[exponent as usize]) + ( + ALP_POW10_F32[factor as usize], + ALP_NEG_POW10_F32[exponent as usize], + ) } fn decode_value(signed_encoded: i32, scale: Self::Scale) -> Self { @@ -329,7 +329,10 @@ impl AlpFloat for f64 { fn decode_scale(exponent: u8, factor: u8) -> Self::Scale { debug_assert!(exponent <= ALP_MAX_EXPONENT_F64); debug_assert!(factor <= exponent); - (ALP_POW10_F64[factor as usize], ALP_NEG_POW10_F64[exponent as usize]) + ( + ALP_POW10_F64[factor as usize], + ALP_NEG_POW10_F64[exponent as usize], + ) } fn decode_value(signed_encoded: i64, scale: Self::Scale) -> Self { @@ -485,7 +488,9 @@ fn parse_alp_page_layout(data: Bytes) -> Result(body_ref, vector_start, vector_end, vector_num_elements)?; expected_next_offset = vector_start .checked_add(vector.expected_stored_size()) - .ok_or_else(|| general_err!("Invalid ALP page: expected next vector offset overflow"))?; + .ok_or_else(|| { + general_err!("Invalid ALP page: expected next vector offset overflow") + })?; vectors.push(vector); } @@ -1164,8 +1169,9 @@ mod tests { let data = make_alp_page_bytes(0, 0, 3, 9, &[12, 8], 12); let err = parse_alp_page_layout::(Bytes::from(data)).unwrap_err(); assert!( - err.to_string() - .contains("Invalid ALP page: vector offset 12 at index 0 does not match expected 8") + err.to_string().contains( + "Invalid ALP page: vector offset 12 at index 0 does not match expected 8" + ) ); }