From 6f8ea5173fdd1a6f562231f94c3b69330b781b12 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:00:44 +0000 Subject: [PATCH 01/10] vortex-row: crate scaffolding Add an empty `vortex-row` crate with a minimal `initialize` stub so the following commits can layer in the row-encoder, codec, scalar functions, and per-encoding kernels without touching the workspace skeleton each time. The crate is wired into the workspace members list and workspace dependency table; `public-api.lock` is generated against the stub. Signed-off-by: Claude --- Cargo.lock | 7 +++++++ Cargo.toml | 2 ++ vortex-row/Cargo.toml | 20 ++++++++++++++++++++ vortex-row/public-api.lock | 3 +++ vortex-row/src/lib.rs | 14 ++++++++++++++ 5 files changed, 46 insertions(+) create mode 100644 vortex-row/Cargo.toml create mode 100644 vortex-row/public-api.lock create mode 100644 vortex-row/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 2819f2bacd0..63a608277d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11034,6 +11034,13 @@ dependencies = [ "vortex-tui", ] +[[package]] +name = "vortex-row" +version = "0.1.0" +dependencies = [ + "vortex-session", +] + [[package]] name = "vortex-runend" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index fb87a953154..9fae5b564bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "vortex-mask", "vortex-utils", "vortex-session", + "vortex-row", "vortex-flatbuffers", "vortex-metrics", "vortex-io", @@ -291,6 +292,7 @@ vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = fa vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false } vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false } vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false } +vortex-row = { version = "0.1.0", path = "./vortex-row", default-features = false } vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false } vortex-scan = { version = "0.1.0", path = "./vortex-scan", default-features = false } vortex-sequence = { version = "0.1.0", path = "encodings/sequence", default-features = false } diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml new file mode 100644 index 00000000000..7515715392c --- /dev/null +++ b/vortex-row/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "vortex-row" +authors = { workspace = true } +categories = { workspace = true } +description = "Row-oriented byte encoder for Vortex arrays, analogous to arrow-row." +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +vortex-session = { workspace = true } diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock new file mode 100644 index 00000000000..d507aa46a00 --- /dev/null +++ b/vortex-row/public-api.lock @@ -0,0 +1,3 @@ +pub mod vortex_row + +pub fn vortex_row::initialize(&vortex_session::VortexSession) diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs new file mode 100644 index 00000000000..f675ca12f4d --- /dev/null +++ b/vortex-row/src/lib.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Row-oriented byte encoder, analogous to Apache Arrow's `arrow-row` crate. +//! +//! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths. +//! This commit only establishes the crate skeleton and an `initialize` stub. + +use vortex_session::VortexSession; + +/// Register the row-encoding scalar functions on the given session. +/// +/// Currently a stub: subsequent commits register `RowSize` and `RowEncode` here. +pub fn initialize(_session: &VortexSession) {} From 4f4aca5f32e93729f1cc6acf6767901e47fcf4aa Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:01:35 +0000 Subject: [PATCH 02/10] vortex-row: add SortField and RowEncodeOptions Introduce the per-column sort-field options and the variadic-function options struct used by the upcoming RowSize / RowEncode scalar functions. `RowEncodeOptions::fields` uses a `SmallVec<[SortField; 4]>` so typical 1-4 column keys avoid a heap allocation. Includes a compact serialize / deserialize helper used later by the scalar-function metadata round-trip. Signed-off-by: Claude --- Cargo.lock | 2 + vortex-row/Cargo.toml | 2 + vortex-row/public-api.lock | 156 ++++++++++++++++++++++++++++++++++++ vortex-row/src/lib.rs | 4 + vortex-row/src/options.rs | 157 +++++++++++++++++++++++++++++++++++++ 5 files changed, 321 insertions(+) create mode 100644 vortex-row/src/options.rs diff --git a/Cargo.lock b/Cargo.lock index 63a608277d3..ce2b4e6d41b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11038,6 +11038,8 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ + "smallvec", + "vortex-error", "vortex-session", ] diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml index 7515715392c..3e314fd7697 100644 --- a/vortex-row/Cargo.toml +++ b/vortex-row/Cargo.toml @@ -17,4 +17,6 @@ version = { workspace = true } workspace = true [dependencies] +smallvec = { workspace = true } +vortex-error = { workspace = true } vortex-session = { workspace = true } diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index d507aa46a00..998a7712f2d 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -1,3 +1,159 @@ pub mod vortex_row +pub mod vortex_row::options + +pub struct vortex_row::options::RowEncodeOptions + +pub vortex_row::options::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> + +impl vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator) -> Self + +impl core::clone::Clone for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions + +impl core::cmp::Eq for vortex_row::options::RowEncodeOptions + +impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool + +impl core::fmt::Debug for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions + +pub struct vortex_row::options::SortField + +pub vortex_row::options::SortField::descending: bool + +pub vortex_row::options::SortField::nulls_first: bool + +impl vortex_row::options::SortField + +pub fn vortex_row::options::SortField::new(bool, bool) -> Self + +pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8 + +pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8 + +impl core::clone::Clone for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField + +impl core::cmp::Eq for vortex_row::options::SortField + +impl core::cmp::PartialEq for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool + +impl core::default::Default for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::default() -> Self + +impl core::fmt::Debug for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::Copy for vortex_row::options::SortField + +impl core::marker::StructuralPartialEq for vortex_row::options::SortField + +pub const vortex_row::options::FIELDS_INLINE: usize + +pub struct vortex_row::RowEncodeOptions + +pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> + +impl vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator) -> Self + +impl core::clone::Clone for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions + +impl core::cmp::Eq for vortex_row::options::RowEncodeOptions + +impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool + +impl core::fmt::Debug for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions + +pub struct vortex_row::SortField + +pub vortex_row::SortField::descending: bool + +pub vortex_row::SortField::nulls_first: bool + +impl vortex_row::options::SortField + +pub fn vortex_row::options::SortField::new(bool, bool) -> Self + +pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8 + +pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8 + +impl core::clone::Clone for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField + +impl core::cmp::Eq for vortex_row::options::SortField + +impl core::cmp::PartialEq for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool + +impl core::default::Default for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::default() -> Self + +impl core::fmt::Debug for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::Copy for vortex_row::options::SortField + +impl core::marker::StructuralPartialEq for vortex_row::options::SortField + pub fn vortex_row::initialize(&vortex_session::VortexSession) diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index f675ca12f4d..9e62f25caf2 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -6,6 +6,10 @@ //! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths. //! This commit only establishes the crate skeleton and an `initialize` stub. +pub mod options; + +pub use options::RowEncodeOptions; +pub use options::SortField; use vortex_session::VortexSession; /// Register the row-encoding scalar functions on the given session. diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs new file mode 100644 index 00000000000..a9e5e2b18ab --- /dev/null +++ b/vortex-row/src/options.rs @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Display; +use std::fmt::Formatter; + +use smallvec::SmallVec; + +/// Per-column options for the row-oriented byte encoder. +/// +/// These options control how a single column is encoded into row bytes: +/// - `descending`: if true, the encoded value bytes are bit-inverted so that +/// lexicographic byte comparison reflects the reverse of the natural ordering. +/// The null sentinel byte is NOT inverted, so nulls keep their requested +/// position relative to non-nulls. +/// - `nulls_first`: if true, nulls sort before non-nulls. If false, nulls sort +/// after non-nulls. Implemented via the sentinel byte that precedes every +/// value's encoded bytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SortField { + /// If true, encoded value bytes are bit-inverted so lexicographic byte + /// comparison reflects the reverse of the natural ordering. + pub descending: bool, + /// If true, nulls sort before non-null values; otherwise nulls sort after. + pub nulls_first: bool, +} + +impl Default for SortField { + fn default() -> Self { + Self { + descending: false, + nulls_first: true, + } + } +} + +impl Display for SortField { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "descending={}, nulls_first={}", + self.descending, self.nulls_first + ) + } +} + +impl SortField { + /// Construct a new `SortField` with explicit options. + pub fn new(descending: bool, nulls_first: bool) -> Self { + Self { + descending, + nulls_first, + } + } + + /// Returns the sentinel byte to write for a non-null value. + #[inline] + pub fn non_null_sentinel(&self) -> u8 { + // Non-null is always 0x01. Null choices are < or > 0x01. + 0x01 + } + + /// Returns the sentinel byte to write for a null value. + #[inline] + pub fn null_sentinel(&self) -> u8 { + if self.nulls_first { + // Nulls before non-nulls (smaller byte sorts first). + 0x00 + } else { + // Nulls after non-nulls (larger byte sorts later). + 0x02 + } + } +} + +/// Inline capacity for [`RowEncodeOptions::fields`]. Up to this many [`SortField`]s +/// are held inline without a heap allocation; beyond, the storage spills. +pub const FIELDS_INLINE: usize = 4; + +/// Options for the variadic [`RowSize`] and [`RowEncode`] scalar functions: +/// one [`SortField`] per input column. +/// +/// Stored in a [`SmallVec`] so that typical 1–4 column keys avoid a heap +/// allocation; longer field lists spill to the heap transparently. +/// +/// [`RowSize`]: super::size::RowSize +/// [`RowEncode`]: super::encode::RowEncode +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct RowEncodeOptions { + /// Per-column sort fields, in left-to-right column order. + pub fields: SmallVec<[SortField; FIELDS_INLINE]>, +} + +impl RowEncodeOptions { + /// Construct a new `RowEncodeOptions` from any iterator of [`SortField`]s. + pub fn new(fields: impl IntoIterator) -> Self { + Self { + fields: fields.into_iter().collect(), + } + } +} + +impl Display for RowEncodeOptions { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for (i, field) in self.fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}", field)?; + } + write!(f, "]") + } +} + +/// Serialize a [`RowEncodeOptions`] to a compact byte vector: 4-byte LE length followed by +/// `2 * len` bytes (descending + nulls_first booleans for each field). +pub(crate) fn serialize_row_encode_options(opts: &RowEncodeOptions) -> Vec { + use vortex_error::VortexExpect; + let n = + u32::try_from(opts.fields.len()).vortex_expect("RowEncodeOptions length must fit in u32"); + let mut out = Vec::with_capacity(4 + 2 * opts.fields.len()); + out.extend_from_slice(&n.to_le_bytes()); + for f in &opts.fields { + out.push(u8::from(f.descending)); + out.push(u8::from(f.nulls_first)); + } + out +} + +/// Deserialize a [`RowEncodeOptions`] produced by [`serialize_row_encode_options`]. +pub(crate) fn deserialize_row_encode_options( + bytes: &[u8], +) -> vortex_error::VortexResult { + if bytes.len() < 4 { + vortex_error::vortex_bail!("RowEncodeOptions metadata must contain a 4-byte length prefix"); + } + let n = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize; + let expected = 4 + 2 * n; + if bytes.len() != expected { + vortex_error::vortex_bail!( + "RowEncodeOptions metadata wrong size: got {}, expected {}", + bytes.len(), + expected + ); + } + let mut fields: SmallVec<[SortField; FIELDS_INLINE]> = SmallVec::with_capacity(n); + let mut i = 4; + for _ in 0..n { + fields.push(SortField { + descending: bytes[i] != 0, + nulls_first: bytes[i + 1] != 0, + }); + i += 2; + } + Ok(RowEncodeOptions { fields }) +} From 1b7af91e7c0b1fa4269b1768809f20c7ff2329b9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:04:03 +0000 Subject: [PATCH 03/10] vortex-row: codec for fixed-width canonical types Add the byte-encoding kernels for the fixed-width portion of the row encoder: Null, Bool, Primitive (12 PTypes), and Decimal (i8..i128). Each encoder writes a 1-byte sentinel followed by the value's row-comparable bytes (sign-flipped big-endian for signed ints, sign-aware mask for floats, etc.). The size pass is a constant `width-per-row` add for these types; the encode pass walks rows and writes into the shared output buffer at `offsets[i] + cursors[i]`. `row_width_for_dtype` classifies the column based purely on its DType. Scalar-level encoders (`encode_scalar_primitive` / `encode_scalar_bool` / `encode_scalar_null` / `encode_scalar` / `encoded_size_for_scalar`) are included for the same fixed-width subset; varlen and nested canonical variants bail with a clear "not yet supported" error and land in follow-up commits. The implementation is deliberately the simplest correct version: bounds-checked array indexing, no `copy_nonoverlapping`, no validity fast-path helper. Subsequent PRs evolve this toward the optimized form. Signed-off-by: Claude --- Cargo.lock | 4 + vortex-row/Cargo.toml | 4 + vortex-row/public-api.lock | 100 ++++++ vortex-row/src/codec.rs | 667 +++++++++++++++++++++++++++++++++++++ vortex-row/src/lib.rs | 1 + 5 files changed, 776 insertions(+) create mode 100644 vortex-row/src/codec.rs diff --git a/Cargo.lock b/Cargo.lock index ce2b4e6d41b..0b84f6dd260 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11038,8 +11038,12 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ + "bytes", "smallvec", + "vortex-array", + "vortex-buffer", "vortex-error", + "vortex-mask", "vortex-session", ] diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml index 3e314fd7697..aaed9a55f51 100644 --- a/vortex-row/Cargo.toml +++ b/vortex-row/Cargo.toml @@ -17,6 +17,10 @@ version = { workspace = true } workspace = true [dependencies] +bytes = { workspace = true } smallvec = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } vortex-error = { workspace = true } +vortex-mask = { workspace = true } vortex-session = { workspace = true } diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index 998a7712f2d..4990e30ba16 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -1,5 +1,105 @@ pub mod vortex_row +pub mod vortex_row::codec + +pub enum vortex_row::codec::RowWidth + +pub vortex_row::codec::RowWidth::Fixed(u32) + +pub vortex_row::codec::RowWidth::Variable + +impl core::clone::Clone for vortex_row::codec::RowWidth + +pub fn vortex_row::codec::RowWidth::clone(&self) -> vortex_row::codec::RowWidth + +impl core::cmp::Eq for vortex_row::codec::RowWidth + +impl core::cmp::PartialEq for vortex_row::codec::RowWidth + +pub fn vortex_row::codec::RowWidth::eq(&self, &vortex_row::codec::RowWidth) -> bool + +impl core::fmt::Debug for vortex_row::codec::RowWidth + +pub fn vortex_row::codec::RowWidth::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_row::codec::RowWidth + +impl core::marker::StructuralPartialEq for vortex_row::codec::RowWidth + +pub const vortex_row::codec::BOOL_ENCODED_SIZE: u32 + +pub const vortex_row::codec::VARLEN_BLOCK_SIZE: usize + +pub const vortex_row::codec::VARLEN_BLOCK_TOTAL: usize + +pub trait vortex_row::codec::RowEncode: core::marker::Copy + +pub fn vortex_row::codec::RowEncode::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for f32 + +pub fn f32::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for f64 + +pub fn f64::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for half::binary16::f16 + +pub fn half::binary16::f16::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for i128 + +pub fn i128::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for i16 + +pub fn i16::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for i32 + +pub fn i32::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for i64 + +pub fn i64::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for i8 + +pub fn i8::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for u16 + +pub fn u16::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for u32 + +pub fn u32::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for u64 + +pub fn u64::encode_to(self, &mut [u8], bool) + +impl vortex_row::codec::RowEncode for u8 + +pub fn u8::encode_to(self, &mut [u8], bool) + +pub fn vortex_row::codec::encode_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult + +pub fn vortex_row::codec::encode_scalar_bool(core::option::Option, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut) + +pub fn vortex_row::codec::encode_scalar_null(vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) + +pub fn vortex_row::codec::encode_scalar_primitive(vortex_array::dtype::ptype::PType, vortex_array::scalar::typed_view::primitive::pvalue::PValue, vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) -> vortex_error::VortexResult<()> + +pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult + +pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + +pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + +pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult + pub mod vortex_row::options pub struct vortex_row::options::RowEncodeOptions diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs new file mode 100644 index 00000000000..73aa7a37db4 --- /dev/null +++ b/vortex-row/src/codec.rs @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow( + clippy::cast_possible_truncation, + clippy::expect_used, + reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32 elsewhere" +)] + +//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants. +//! +//! The encoded byte format produces a lexicographically byte-comparable representation: +//! comparing the byte slices of two encoded rows yields the same ordering as the +//! original logical (tuple) comparison of their values, modulo nulls placement and +//! descending-ness as configured by [`SortField`]. +//! +//! Conventions: +//! - Every value is preceded by a 1-byte sentinel that orders nulls relative to non-nulls. +//! - For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), not the +//! sentinel. +//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types. +//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top +//! bit; negative flips all bits. +//! +//! This commit covers only the fixed-width canonical variants (Null, Bool, Primitive, +//! Decimal); variable-length and nested canonical variants land in later commits. + +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::NullArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::DecimalType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::PType; +use vortex_array::dtype::half::f16; +use vortex_array::match_each_native_ptype; +use vortex_buffer::ByteBufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::options::SortField; + +/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte). +pub const BOOL_ENCODED_SIZE: u32 = 2; + +/// Block size used in the variable-length encoding. +pub const VARLEN_BLOCK_SIZE: usize = 32; +/// Total bytes per varlen block including the trailing continuation marker. +pub const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; + +/// Returns the size in bytes of the encoded form of a variable-length value of the given length. +#[inline] +#[allow( + dead_code, + reason = "used once varlen support lands in a follow-up commit" +)] +fn encoded_size_for_varlen(len: usize) -> u32 { + // 1 sentinel + ceil(len/32)*33 content bytes (or 1 zero terminator if empty) + if len == 0 { + 1 + 1 + } else { + let blocks = len.div_ceil(VARLEN_BLOCK_SIZE); + 1 + (blocks as u32) * (VARLEN_BLOCK_TOTAL as u32) + } +} + +/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel). +#[inline] +const fn encoded_size_for_fixed(value_bytes: u32) -> u32 { + 1 + value_bytes +} + +/// Per-row width classification for a column. +/// +/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless +/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary, +/// List, or any composite that recurses through a variable-width field). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum RowWidth { + /// Per-row width is the same constant for every row in the column. + Fixed(u32), + /// Per-row width is data-dependent. + Variable, +} + +/// Classify a column's per-row encoded width by inspecting only its [`DType`]. +/// +/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value), +/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the +/// data. +/// +/// Classification does not depend on the [`SortField`]: null-vs-non-null encoding width is +/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls). +/// +/// # Errors +/// +/// Returns an error for dtypes that the row encoder does not yet support. Variable-length +/// dtypes (Utf8/Binary), nested dtypes (Struct/FixedSizeList/Extension), and +/// Variant/Union/List arrive in later commits. +pub fn row_width_for_dtype(dtype: &DType) -> VortexResult { + match dtype { + DType::Null => Ok(RowWidth::Fixed(1)), + DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)), + DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed( + ptype.byte_width() as u32, + ))), + DType::Decimal(dt, _) => { + let vt = DecimalType::smallest_decimal_value_type(dt); + Ok(RowWidth::Fixed(encoded_size_for_fixed( + vt.byte_width() as u32 + ))) + } + DType::Utf8(_) | DType::Binary(_) => { + vortex_bail!("row encoding for {} is not yet supported", dtype) + } + DType::Struct(..) | DType::FixedSizeList(..) | DType::List(..) | DType::Extension(..) => { + vortex_bail!("row encoding for {} is not yet supported", dtype) + } + DType::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"), + } +} + +/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`. +/// +/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the +/// per-row size to each entry so multiple columns can accumulate into the same buffer. +/// +/// # Errors +/// +/// Returns an error for unsupported canonical variants. Variable-length and nested +/// variants land in later commits. +pub fn field_size( + canonical: &Canonical, + _field: SortField, + sizes: &mut [u32], + _ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match canonical { + Canonical::Null(arr) => add_size_null(arr, sizes), + Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)), + Canonical::Primitive(arr) => add_size_primitive(arr, sizes), + Canonical::Decimal(arr) => add_size_decimal(arr, sizes), + Canonical::VarBinView(_) + | Canonical::Struct(_) + | Canonical::FixedSizeList(_) + | Canonical::Extension(_) + | Canonical::List(_) => vortex_bail!( + "row encoding does not yet support canonical type {:?}", + canonical.dtype() + ), + Canonical::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + } + Ok(()) +} + +/// Encode each row's bytes for the given canonical view into `out`, writing starting at +/// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of +/// bytes written. +/// +/// After this call returns successfully, `cursors[i]` will have advanced by exactly the +/// per-row contribution previously computed by [`field_size`] for the same column. +pub fn field_encode( + canonical: &Canonical, + field: SortField, + offsets: &[u32], + cursors: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match canonical { + Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out), + Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?, + Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?, + Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?, + Canonical::VarBinView(_) + | Canonical::Struct(_) + | Canonical::FixedSizeList(_) + | Canonical::Extension(_) + | Canonical::List(_) => vortex_bail!( + "row encoding does not yet support canonical type {:?}", + canonical.dtype() + ), + Canonical::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + } + Ok(()) +} + +fn add_size_const(sizes: &mut [u32], add: u32) { + for s in sizes.iter_mut() { + *s += add; + } +} + +fn add_size_null(arr: &NullArray, sizes: &mut [u32]) { + debug_assert_eq!(arr.len(), sizes.len()); + // Just a sentinel byte per row. + for s in sizes.iter_mut() { + *s += 1; + } +} + +fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) { + let width = arr.ptype().byte_width() as u32; + add_size_const(sizes, encoded_size_for_fixed(width)); +} + +fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) { + let width = arr.values_type().byte_width() as u32; + add_size_const(sizes, encoded_size_for_fixed(width)); +} + +fn encode_null( + arr: &NullArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], +) { + let sentinel = field.null_sentinel(); + for i in 0..arr.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = sentinel; + col_offset[i] += 1; + } +} + +fn encode_bool( + arr: &BoolArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let bits = arr.clone().into_bit_buffer(); + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let xor = if field.descending { 0xFF } else { 0x00 }; + for i in 0..bits.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + // false=0x01, true=0x02 so false < true; XOR for descending + let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; + out[pos + 1] = raw ^ xor; + } else { + out[pos] = null; + out[pos + 1] = 0; + } + col_offset[i] += BOOL_ENCODED_SIZE; + } + Ok(()) +} + +fn encode_primitive( + arr: &PrimitiveArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match_each_native_ptype!(arr.ptype(), |T| { + encode_primitive_typed::(arr, field, row_offsets, col_offset, out, ctx)?; + }); + Ok(()) +} + +fn encode_primitive_typed( + arr: &PrimitiveArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let slice: &[T] = arr.as_slice(); + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let value_bytes = size_of::(); + for (i, &v) in slice.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + // Zero-fill the value bytes. + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += encoded_size_for_fixed(value_bytes as u32); + } + Ok(()) +} + +fn encode_decimal( + arr: &DecimalArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + match arr.values_type() { + DecimalType::I8 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I16 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I32 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I64 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I128 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I256 => { + vortex_bail!("row encoding for Decimal256 is not yet implemented") + } + } + Ok(()) +} + +fn encode_decimal_typed( + arr: &DecimalArray, + mask: &vortex_mask::Mask, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], +) where + T: vortex_array::dtype::NativeDecimalType + RowEncode, +{ + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let value_bytes = size_of::(); + let total = encoded_size_for_fixed(value_bytes as u32); + let slice = arr.buffer::(); + for i in 0..slice.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += total; + } +} + +/// Internal trait for encoding a fixed-width native value into byte slots. +/// +/// Implementations must produce a sequence of `size_of::()` bytes that is +/// lexicographically byte-comparable according to the natural ordering of the type. +pub trait RowEncode: Copy { + /// Encode this value into `out`, inverting the bytes for descending order. + fn encode_to(self, out: &mut [u8], descending: bool); +} + +macro_rules! impl_row_encode_unsigned { + ($t:ty) => { + impl RowEncode for $t { + #[inline] + fn encode_to(self, out: &mut [u8], descending: bool) { + let bytes = self.to_be_bytes(); + if descending { + for (i, b) in bytes.iter().enumerate() { + out[i] = b ^ 0xFF; + } + } else { + out.copy_from_slice(&bytes); + } + } + } + }; +} + +macro_rules! impl_row_encode_signed { + ($t:ty) => { + impl RowEncode for $t { + #[inline] + fn encode_to(self, out: &mut [u8], descending: bool) { + let mut bytes = self.to_be_bytes(); + // Flip sign bit so negatives < non-negatives lexicographically. + bytes[0] ^= 0x80; + if descending { + for (i, b) in bytes.iter().enumerate() { + out[i] = b ^ 0xFF; + } + } else { + out.copy_from_slice(&bytes); + } + } + } + }; +} + +impl_row_encode_unsigned!(u8); +impl_row_encode_unsigned!(u16); +impl_row_encode_unsigned!(u32); +impl_row_encode_unsigned!(u64); +impl_row_encode_signed!(i8); +impl_row_encode_signed!(i16); +impl_row_encode_signed!(i32); +impl_row_encode_signed!(i64); +impl_row_encode_signed!(i128); + +impl RowEncode for f32 { + fn encode_to(self, out: &mut [u8], descending: bool) { + let bits = self.to_bits(); + let mask: u32 = if (bits >> 31) == 0 { + 0x8000_0000 + } else { + 0xFFFF_FFFF + }; + let mut bytes = (bits ^ mask).to_be_bytes(); + if descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} + +impl RowEncode for f64 { + fn encode_to(self, out: &mut [u8], descending: bool) { + let bits = self.to_bits(); + let mask: u64 = if (bits >> 63) == 0 { + 0x8000_0000_0000_0000 + } else { + 0xFFFF_FFFF_FFFF_FFFF + }; + let mut bytes = (bits ^ mask).to_be_bytes(); + if descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} + +impl RowEncode for f16 { + fn encode_to(self, out: &mut [u8], descending: bool) { + let bits = self.to_bits(); + let mask: u16 = if (bits >> 15) == 0 { 0x8000 } else { 0xFFFF }; + let mut bytes = (bits ^ mask).to_be_bytes(); + if descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} + +/// Encode a single scalar primitive value of a known PType into a buffer slot. +pub fn encode_scalar_primitive( + ptype: PType, + value: vortex_array::scalar::PValue, + field: SortField, + is_null: bool, + out: &mut ByteBufferMut, +) -> VortexResult<()> { + if is_null { + out.push(field.null_sentinel()); + return Ok(()); + } + out.push(field.non_null_sentinel()); + let width = ptype.byte_width(); + let mut tmp = [0u8; 16]; + let buf = &mut tmp[..width]; + match_each_native_ptype!( + ptype, + integral: |T| { + let v: T = T::try_from(value)?; + v.encode_to(buf, field.descending); + }, + floating: |T| { + let v: T = T::try_from(value)?; + v.encode_to(buf, field.descending); + } + ); + out.extend_from_slice(buf); + Ok(()) +} + +/// Encode a single boolean value. +pub fn encode_scalar_bool(value: Option, field: SortField, out: &mut ByteBufferMut) { + match value { + None => { + out.push(field.null_sentinel()); + out.push(0); + } + Some(b) => { + out.push(field.non_null_sentinel()); + let raw = if b { 0x02u8 } else { 0x01u8 }; + let xor = if field.descending { 0xFFu8 } else { 0 }; + out.push(raw ^ xor); + } + } +} + +/// Encode a single null-type value (only the sentinel). +pub fn encode_scalar_null(field: SortField, is_null: bool, out: &mut ByteBufferMut) { + if is_null { + out.push(field.null_sentinel()); + } else { + out.push(field.non_null_sentinel()); + } +} + +/// Returns the per-row encoded size for a scalar value (used for the Constant fast path). +pub fn encoded_size_for_scalar( + scalar: &vortex_array::scalar::Scalar, + _field: SortField, +) -> VortexResult { + if scalar.is_null() { + match scalar.dtype() { + DType::Null => Ok(1), + DType::Bool(_) => Ok(BOOL_ENCODED_SIZE), + DType::Primitive(ptype, _) => Ok(encoded_size_for_fixed(ptype.byte_width() as u32)), + DType::Decimal(dt, _) => { + let vt = DecimalType::smallest_decimal_value_type(dt); + Ok(encoded_size_for_fixed(vt.byte_width() as u32)) + } + _ => vortex_bail!( + "unsupported scalar dtype for row encoding: {}", + scalar.dtype() + ), + } + } else { + match scalar.dtype() { + DType::Null => Ok(1), + DType::Bool(_) => Ok(BOOL_ENCODED_SIZE), + DType::Primitive(ptype, _) => Ok(encoded_size_for_fixed(ptype.byte_width() as u32)), + DType::Decimal(..) => { + let dec = scalar.as_decimal(); + let vt = dec + .decimal_value() + .map(|v| v.decimal_type()) + .unwrap_or(DecimalType::I128); + Ok(encoded_size_for_fixed(vt.byte_width() as u32)) + } + _ => vortex_bail!( + "unsupported scalar dtype for row encoding: {}", + scalar.dtype() + ), + } + } +} + +/// Encode a single scalar value into a fresh `Bytes` buffer. +pub fn encode_scalar( + scalar: &vortex_array::scalar::Scalar, + field: SortField, +) -> VortexResult { + use vortex_array::scalar::PValue; + let size = encoded_size_for_scalar(scalar, field)? as usize; + let mut out = ByteBufferMut::with_capacity(size); + if scalar.is_null() { + match scalar.dtype() { + DType::Null => out.push(field.null_sentinel()), + DType::Bool(_) => { + out.push(field.null_sentinel()); + out.push(0); + } + DType::Primitive(ptype, _) => { + out.push(field.null_sentinel()); + let width = ptype.byte_width(); + for _ in 0..width { + out.push(0); + } + } + DType::Decimal(dt, _) => { + out.push(field.null_sentinel()); + let vt = DecimalType::smallest_decimal_value_type(dt); + for _ in 0..vt.byte_width() { + out.push(0); + } + } + _ => vortex_bail!( + "unsupported scalar dtype for row encoding: {}", + scalar.dtype() + ), + } + } else { + match scalar.dtype() { + DType::Null => out.push(field.non_null_sentinel()), + DType::Bool(_) => { + let v = scalar.as_bool().value().unwrap_or(false); + encode_scalar_bool(Some(v), field, &mut out); + } + DType::Primitive(ptype, _) => { + let v: PValue = scalar + .as_primitive() + .pvalue() + .ok_or_else(|| vortex_error::vortex_err!("missing primitive value"))?; + encode_scalar_primitive(*ptype, v, field, false, &mut out)?; + } + DType::Decimal(..) => { + let dec = scalar.as_decimal(); + out.push(field.non_null_sentinel()); + let value = dec + .decimal_value() + .ok_or_else(|| vortex_error::vortex_err!("missing decimal value"))?; + match value { + vortex_array::scalar::DecimalValue::I8(v) => { + let mut tmp = [0u8; 1]; + v.encode_to(&mut tmp, field.descending); + out.extend_from_slice(&tmp); + } + vortex_array::scalar::DecimalValue::I16(v) => { + let mut tmp = [0u8; 2]; + v.encode_to(&mut tmp, field.descending); + out.extend_from_slice(&tmp); + } + vortex_array::scalar::DecimalValue::I32(v) => { + let mut tmp = [0u8; 4]; + v.encode_to(&mut tmp, field.descending); + out.extend_from_slice(&tmp); + } + vortex_array::scalar::DecimalValue::I64(v) => { + let mut tmp = [0u8; 8]; + v.encode_to(&mut tmp, field.descending); + out.extend_from_slice(&tmp); + } + vortex_array::scalar::DecimalValue::I128(v) => { + let mut tmp = [0u8; 16]; + v.encode_to(&mut tmp, field.descending); + out.extend_from_slice(&tmp); + } + vortex_array::scalar::DecimalValue::I256(_) => { + vortex_bail!("row encoding for Decimal256 is not yet implemented") + } + } + } + _ => vortex_bail!( + "unsupported scalar dtype for row encoding: {}", + scalar.dtype() + ), + } + } + Ok(out.freeze().into_inner()) +} diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index 9e62f25caf2..bdac4c8f48e 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -6,6 +6,7 @@ //! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths. //! This commit only establishes the crate skeleton and an `initialize` stub. +pub mod codec; pub mod options; pub use options::RowEncodeOptions; From d3f3da4b2bf165549876b7848f6aa8c1fddb40fe Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:06:19 +0000 Subject: [PATCH 04/10] vortex-row: codec for varlen canonical types Extend the codec to handle Utf8/Binary via VarBinView arrays. Each value encodes as a 1-byte sentinel followed by 32-byte chunks: every full chunk has a 0xFF continuation marker; the final partial chunk pads with zeros and writes the partial length (1..=32) as its trailing byte. `encode_varlen_value` uses the simple byte-at-a-time XOR loop here; a faster `copy_nonoverlapping` + stamped continuation version replaces it in PR 2. `encode_varbinview` uses `arr.with_iterator(...)` for both the nullable and non-nullable branches; a direct view walk for the no-nulls branch lands in PR 2 too. `row_width_for_dtype` now returns `Variable` for Utf8/Binary; the size pass and encode dispatchers route through `add_size_varbinview` / `encode_varbinview` correspondingly. The scalar encoder gains `encode_scalar_varlen` and the matching Utf8/Binary arms. Signed-off-by: Claude --- vortex-row/public-api.lock | 2 + vortex-row/src/codec.rs | 147 ++++++++++++++++++++++++++++++++++--- 2 files changed, 137 insertions(+), 12 deletions(-) diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index 4990e30ba16..1afc1f05442 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -92,6 +92,8 @@ pub fn vortex_row::codec::encode_scalar_null(vortex_row::options::SortField, boo pub fn vortex_row::codec::encode_scalar_primitive(vortex_array::dtype::ptype::PType, vortex_array::scalar::typed_view::primitive::pvalue::PValue, vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) -> vortex_error::VortexResult<()> +pub fn vortex_row::codec::encode_scalar_varlen(core::option::Option<&[u8]>, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut) + pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index 73aa7a37db4..4f70d80e5ae 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -27,10 +27,12 @@ use vortex_array::Canonical; use vortex_array::ExecutionCtx; +use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; use vortex_array::arrays::NullArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; use vortex_array::dtype::DType; use vortex_array::dtype::DecimalType; use vortex_array::dtype::NativePType; @@ -53,10 +55,6 @@ pub const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; /// Returns the size in bytes of the encoded form of a variable-length value of the given length. #[inline] -#[allow( - dead_code, - reason = "used once varlen support lands in a follow-up commit" -)] fn encoded_size_for_varlen(len: usize) -> u32 { // 1 sentinel + ceil(len/32)*33 content bytes (or 1 zero terminator if empty) if len == 0 { @@ -113,9 +111,7 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult { vt.byte_width() as u32 ))) } - DType::Utf8(_) | DType::Binary(_) => { - vortex_bail!("row encoding for {} is not yet supported", dtype) - } + DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable), DType::Struct(..) | DType::FixedSizeList(..) | DType::List(..) | DType::Extension(..) => { vortex_bail!("row encoding for {} is not yet supported", dtype) } @@ -139,15 +135,15 @@ pub fn field_size( canonical: &Canonical, _field: SortField, sizes: &mut [u32], - _ctx: &mut ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> VortexResult<()> { match canonical { Canonical::Null(arr) => add_size_null(arr, sizes), Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)), Canonical::Primitive(arr) => add_size_primitive(arr, sizes), Canonical::Decimal(arr) => add_size_decimal(arr, sizes), - Canonical::VarBinView(_) - | Canonical::Struct(_) + Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?, + Canonical::Struct(_) | Canonical::FixedSizeList(_) | Canonical::Extension(_) | Canonical::List(_) => vortex_bail!( @@ -180,8 +176,8 @@ pub fn field_encode( Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?, Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?, Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?, - Canonical::VarBinView(_) - | Canonical::Struct(_) + Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?, + Canonical::Struct(_) | Canonical::FixedSizeList(_) | Canonical::Extension(_) | Canonical::List(_) => vortex_bail!( @@ -219,6 +215,25 @@ fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) { add_size_const(sizes, encoded_size_for_fixed(width)); } +fn add_size_varbinview( + arr: &VarBinViewArray, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let views = arr.views(); + for (i, view) in views.iter().enumerate() { + let valid = mask.value(i); + if !valid { + sizes[i] += 1; // sentinel only + } else { + let len = view.len() as usize; + sizes[i] += encoded_size_for_varlen(len); + } + } + Ok(()) +} + fn encode_null( arr: &NullArray, field: SortField, @@ -369,6 +384,69 @@ fn encode_decimal_typed( } } +fn encode_varbinview( + arr: &VarBinViewArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + + arr.with_iterator(|iter| { + for (i, maybe) in iter.enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if !mask.value(i) { + out[pos] = null; + col_offset[i] += 1; + continue; + } + let bytes: &[u8] = maybe.unwrap_or(&[]); + out[pos] = non_null; + let written = encode_varlen_value(bytes, &mut out[pos + 1..], field.descending); + col_offset[i] += 1 + written; + } + }); + Ok(()) +} + +/// Encode a variable-length byte slice into `out` in 32-byte blocks with +/// continuation markers. Returns the number of bytes written. +fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { + let xor = if descending { 0xFFu8 } else { 0x00 }; + if bytes.is_empty() { + // Single zero terminator. + out[0] = xor; + return 1; + } + let mut written = 0usize; + let mut remaining = bytes; + while remaining.len() > VARLEN_BLOCK_SIZE { + // Full block, continuation marker 0xFF (then XORed if descending). + let block = &remaining[..VARLEN_BLOCK_SIZE]; + for (i, &b) in block.iter().enumerate() { + out[written + i] = b ^ xor; + } + out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ xor; + written += VARLEN_BLOCK_TOTAL; + remaining = &remaining[VARLEN_BLOCK_SIZE..]; + } + // Final partial block: pad with zeros, last byte = remaining.len() (1..=32). + let n = remaining.len(); + for (i, &b) in remaining.iter().enumerate() { + out[written + i] = b ^ xor; + } + for j in n..VARLEN_BLOCK_SIZE { + out[written + j] = xor; + } + out[written + VARLEN_BLOCK_SIZE] = (n as u8) ^ xor; + written += VARLEN_BLOCK_TOTAL; + written as u32 +} + /// Internal trait for encoding a fixed-width native value into byte slots. /// /// Implementations must produce a sequence of `size_of::()` bytes that is @@ -507,6 +585,27 @@ pub fn encode_scalar_primitive( Ok(()) } +/// Encode a single varlen value into a buffer. +pub fn encode_scalar_varlen(value: Option<&[u8]>, field: SortField, out: &mut ByteBufferMut) { + match value { + None => out.push(field.null_sentinel()), + Some(bytes) => { + out.push(field.non_null_sentinel()); + let needed = if bytes.is_empty() { + 1 + } else { + bytes.len().div_ceil(VARLEN_BLOCK_SIZE) * VARLEN_BLOCK_TOTAL + }; + let start = out.len(); + for _ in 0..needed { + out.push(0); + } + let written = encode_varlen_value(bytes, &mut out[start..], field.descending); + debug_assert_eq!(written as usize, needed); + } + } +} + /// Encode a single boolean value. pub fn encode_scalar_bool(value: Option, field: SortField, out: &mut ByteBufferMut) { match value { @@ -546,6 +645,7 @@ pub fn encoded_size_for_scalar( let vt = DecimalType::smallest_decimal_value_type(dt); Ok(encoded_size_for_fixed(vt.byte_width() as u32)) } + DType::Utf8(_) | DType::Binary(_) => Ok(1), _ => vortex_bail!( "unsupported scalar dtype for row encoding: {}", scalar.dtype() @@ -564,6 +664,18 @@ pub fn encoded_size_for_scalar( .unwrap_or(DecimalType::I128); Ok(encoded_size_for_fixed(vt.byte_width() as u32)) } + DType::Utf8(_) => { + let bs = scalar + .as_utf8() + .value() + .map(|s| s.as_str().len()) + .unwrap_or(0); + Ok(encoded_size_for_varlen(bs)) + } + DType::Binary(_) => { + let bs = scalar.as_binary().value().map(|b| b.len()).unwrap_or(0); + Ok(encoded_size_for_varlen(bs)) + } _ => vortex_bail!( "unsupported scalar dtype for row encoding: {}", scalar.dtype() @@ -601,6 +713,7 @@ pub fn encode_scalar( out.push(0); } } + DType::Utf8(_) | DType::Binary(_) => out.push(field.null_sentinel()), _ => vortex_bail!( "unsupported scalar dtype for row encoding: {}", scalar.dtype() @@ -657,6 +770,16 @@ pub fn encode_scalar( } } } + DType::Utf8(_) => { + let v = scalar.as_utf8(); + let bytes = v.value().map(|s| s.as_str().as_bytes()).unwrap_or(&[]); + encode_scalar_varlen(Some(bytes), field, &mut out); + } + DType::Binary(_) => { + let v = scalar.as_binary(); + let bytes = v.value().map(|b| b.as_slice()).unwrap_or(&[]); + encode_scalar_varlen(Some(bytes), field, &mut out); + } _ => vortex_bail!( "unsupported scalar dtype for row encoding: {}", scalar.dtype() From 570d358939f4d8e7919ea377773c0cfdd6637fe8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:08:07 +0000 Subject: [PATCH 05/10] vortex-row: codec for nested canonical types Extend the codec to handle Struct, FixedSizeList, and Extension canonical variants. Each nested row encodes as `outer_sentinel | child bytes...`; for null rows the child bytes are zero-filled after the recursive encoders run so two null rows compare equal regardless of which non-null values would have been written by the children. `row_width_for_dtype` recurses through Struct fields and FSL elements to return `Fixed(w)` when every leaf is fixed; otherwise `Variable`. Extension delegates to its storage dtype. List remains `Variable` and ListView still bails (the row encoder's output is itself a ListView, so nested ListView isn't a near-term use case). Variant and Union bail explicitly. Signed-off-by: Claude --- vortex-row/src/codec.rs | 227 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 216 insertions(+), 11 deletions(-) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index 4f70d80e5ae..8468301e5b3 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -30,9 +30,15 @@ use vortex_array::ExecutionCtx; use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::NullArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; use vortex_array::dtype::DecimalType; use vortex_array::dtype::NativePType; @@ -112,9 +118,28 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult { ))) } DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable), - DType::Struct(..) | DType::FixedSizeList(..) | DType::List(..) | DType::Extension(..) => { - vortex_bail!("row encoding for {} is not yet supported", dtype) + DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? { + // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL + // itself, then `n` copies of the element width. + RowWidth::Fixed(w) => { + let body = w.saturating_mul(*n); + Ok(RowWidth::Fixed(body.saturating_add(1))) + } + RowWidth::Variable => Ok(RowWidth::Variable), + }, + DType::Struct(fields, _) => { + // Struct is fixed iff all its fields are fixed; sum their widths plus a sentinel. + let mut total: u32 = 1; // outer sentinel + for field_dtype in fields.fields() { + match row_width_for_dtype(&field_dtype)? { + RowWidth::Fixed(w) => total = total.saturating_add(w), + RowWidth::Variable => return Ok(RowWidth::Variable), + } + } + Ok(RowWidth::Fixed(total)) } + DType::List(..) => Ok(RowWidth::Variable), + DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()), DType::Variant(_) => { vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") } @@ -133,7 +158,7 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult { /// variants land in later commits. pub fn field_size( canonical: &Canonical, - _field: SortField, + field: SortField, sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { @@ -143,10 +168,10 @@ pub fn field_size( Canonical::Primitive(arr) => add_size_primitive(arr, sizes), Canonical::Decimal(arr) => add_size_decimal(arr, sizes), Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?, - Canonical::Struct(_) - | Canonical::FixedSizeList(_) - | Canonical::Extension(_) - | Canonical::List(_) => vortex_bail!( + Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?, + Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?, + Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?, + Canonical::List(_) => vortex_bail!( "row encoding does not yet support canonical type {:?}", canonical.dtype() ), @@ -177,10 +202,10 @@ pub fn field_encode( Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?, Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?, Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?, - Canonical::Struct(_) - | Canonical::FixedSizeList(_) - | Canonical::Extension(_) - | Canonical::List(_) => vortex_bail!( + Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?, + Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?, + Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?, + Canonical::List(_) => vortex_bail!( "row encoding does not yet support canonical type {:?}", canonical.dtype() ), @@ -234,6 +259,60 @@ fn add_size_varbinview( Ok(()) } +fn add_size_struct( + arr: &StructArray, + field: SortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + // null sentinel: 1 byte per row. + for s in sizes.iter_mut() { + *s += 1; + } + // Each field adds its own per-row size. + for child in arr.iter_unmasked_fields() { + let canonical = child.clone().execute::(ctx)?; + field_size(&canonical, field, sizes, ctx)?; + } + Ok(()) +} + +fn add_size_fsl( + arr: &FixedSizeListArray, + field: SortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + debug_assert_eq!(n, sizes.len()); + let list_size = arr.list_size() as usize; + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), n * list_size); + // Sizing: 1 sentinel + sum of element sizes (`list_size` per row). + // We compute element-wise sizes into a contiguous scratch buffer then reduce by row. + let mut elem_sizes = vec![0u32; n * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + for i in 0..n { + let mut sum: u32 = 1; // sentinel + let base = i * list_size; + for j in 0..list_size { + sum = sum.saturating_add(elem_sizes[base + j]); + } + sizes[i] += sum; + } + Ok(()) +} + +fn add_size_extension( + arr: &ExtensionArray, + field: SortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let storage = arr.storage_array().clone().execute::(ctx)?; + field_size(&storage, field, sizes, ctx) +} + fn encode_null( arr: &NullArray, field: SortField, @@ -413,6 +492,132 @@ fn encode_varbinview( Ok(()) } +fn encode_struct( + arr: &StructArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + + // First, write the sentinel for each row. We track the post-sentinel cursor offsets + // for the body in `body_cursors` (which start exactly at +1 of the input cursor). + // For null rows we additionally need to zero-fill the (uniform-width) field bytes, + // but because struct widths are variable in general, we record null indexes first + // and zero-fill after we know each row's contribution. + // + // To keep the implementation simple we: + // 1) advance the cursor past the sentinel, + // 2) recursively encode each field's bytes (the field encoders ignore nullness of + // the struct, but use their own per-field nullness), + // 3) for null struct rows, overwrite the body bytes with zeros so the encoded form + // depends only on the sentinel. + let body_start: Vec = (0..n).map(|i| col_offset[i] + 1).collect(); + for i in 0..n { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = if mask.value(i) { non_null } else { null }; + col_offset[i] += 1; + } + + for child in arr.iter_unmasked_fields() { + let canonical = child.clone().execute::(ctx)?; + field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?; + } + + // Zero-fill body bytes of null rows (the field encoders may have written values). + for i in 0..n { + if !mask.value(i) { + let start = (row_offsets[i] + body_start[i]) as usize; + let end = (row_offsets[i] + col_offset[i]) as usize; + for b in &mut out[start..end] { + *b = 0; + } + } + } + + Ok(()) +} + +fn encode_fsl( + arr: &FixedSizeListArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + let list_size = arr.list_size() as usize; + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), n * list_size); + + // Write sentinels and remember body start for null zero-fill. + let body_start: Vec = (0..n).map(|i| col_offset[i] + 1).collect(); + for i in 0..n { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = if mask.value(i) { non_null } else { null }; + col_offset[i] += 1; + } + + // Encode all `n * list_size` elements into the body. Build a fresh + // (offsets, cursors) pair where each element gets one slot. Then sum bytes back + // into the parent col_offset. + let mut elem_sizes = vec![0u32; n * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + // Element offsets are sequential starting at each parent's current cursor position. + let mut elem_offsets = vec![0u32; n * list_size]; + for i in 0..n { + let mut acc = row_offsets[i] + col_offset[i]; + for j in 0..list_size { + elem_offsets[i * list_size + j] = acc; + acc = acc.saturating_add(elem_sizes[i * list_size + j]); + } + } + let mut elem_cursors = vec![0u32; n * list_size]; + field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?; + // Advance the parent cursors by the total per-row element bytes. + for i in 0..n { + let mut sum: u32 = 0; + for j in 0..list_size { + sum = sum.saturating_add(elem_sizes[i * list_size + j]); + } + col_offset[i] = col_offset[i].saturating_add(sum); + } + + // Zero-fill null bodies. + for i in 0..n { + if !mask.value(i) { + let start = (row_offsets[i] + body_start[i]) as usize; + let end = (row_offsets[i] + col_offset[i]) as usize; + for b in &mut out[start..end] { + *b = 0; + } + } + } + + Ok(()) +} + +fn encode_extension( + arr: &ExtensionArray, + field: SortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let storage = arr.storage_array().clone().execute::(ctx)?; + field_encode(&storage, field, row_offsets, col_offset, out, ctx) +} + /// Encode a variable-length byte slice into `out` in 32-byte blocks with /// continuation markers. Returns the number of bytes written. fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { From 5374f3b65c42c2c7bb7a646e61a956d12f1aef0c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:09:31 +0000 Subject: [PATCH 06/10] vortex-row: compute_sizes helper and RowSize ScalarFn Add the size-pass machinery used by both RowSize and the upcoming RowEncode pipeline. `compute_sizes` walks the N input columns once, classifying each via `row_width_for_dtype` and accumulating fixed-width-prefix sums in `fixed_per_row` while pushing per-row sums of variable-length columns into a lazily allocated `var_lengths` vec. The classification result (`ColKind` + `SizePassResult`) is private to the crate; RowEncode consumes it in a later commit to choose between the arithmetic and cursor encode paths. `RowSize` returns a `Struct { fixed: U32, var: U32 }` so callers can read the per-row width without realizing the constant `fixed` slot as a per-row buffer (it's a `ConstantArray`); the `var` slot is a `ConstantArray(0)` when no varlen column is present. `dispatch_size` is the fallback-only path for PR 1 (canonicalize, then codec::field_size). The `RowSizeKernel` trait exists but is unused; per- encoding fast paths and the inventory registry arrive in PR 3. `initialize()` does NOT register RowSize yet - that lands once RowEncode is in place, so the session-registered pair appears together. Signed-off-by: Claude --- vortex-row/public-api.lock | 76 ++++++++++ vortex-row/src/lib.rs | 3 + vortex-row/src/size.rs | 288 +++++++++++++++++++++++++++++++++++++ 3 files changed, 367 insertions(+) create mode 100644 vortex-row/src/size.rs diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index 1afc1f05442..85985bf7521 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -182,6 +182,46 @@ impl core::marker::StructuralPartialEq for vortex_row::options::SortField pub const vortex_row::options::FIELDS_INLINE: usize +pub mod vortex_row::size + +pub struct vortex_row::size::RowSize + +impl core::clone::Clone for vortex_row::size::RowSize + +pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize + +impl core::fmt::Debug for vortex_row::size::RowSize + +pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize + +pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions + +pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity + +pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName + +pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult + +pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId + +pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool + +pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool + +pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult + +pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> + +pub trait vortex_row::size::RowSizeKernel: vortex_array::array::vtable::VTable + +pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + pub struct vortex_row::RowEncodeOptions pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> @@ -214,6 +254,38 @@ pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&sel impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions +pub struct vortex_row::RowSize + +impl core::clone::Clone for vortex_row::size::RowSize + +pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize + +impl core::fmt::Debug for vortex_row::size::RowSize + +pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize + +pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions + +pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity + +pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName + +pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult + +pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId + +pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool + +pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool + +pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult + +pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> + pub struct vortex_row::SortField pub vortex_row::SortField::descending: bool @@ -258,4 +330,8 @@ impl core::marker::Copy for vortex_row::options::SortField impl core::marker::StructuralPartialEq for vortex_row::options::SortField +pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable + +pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + pub fn vortex_row::initialize(&vortex_session::VortexSession) diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index bdac4c8f48e..6f1d8fbeab3 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -8,9 +8,12 @@ pub mod codec; pub mod options; +pub mod size; pub use options::RowEncodeOptions; pub use options::SortField; +pub use size::RowSize; +pub use size::RowSizeKernel; use vortex_session::VortexSession; /// Register the row-encoding scalar functions on the given session. diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs new file mode 100644 index 00000000000..fbde52e1863 --- /dev/null +++ b/vortex-row/src/size.rs @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns. + +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::VTable; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::dtype::StructFields; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::Arity; +use vortex_array::scalar_fn::ChildName; +use vortex_array::scalar_fn::ExecutionArgs; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_session::VortexSession; + +use crate::codec; +use crate::codec::RowWidth; +use crate::options::RowEncodeOptions; +use crate::options::SortField; +use crate::options::deserialize_row_encode_options; +use crate::options::serialize_row_encode_options; + +/// Classification of a single input column for the size pass. +/// +/// Tracks each column's within-row byte offset (the constant prefix from all preceding +/// fixed-width columns) and, for fixed columns, whether any variable-length column has +/// appeared yet — the encode pass uses this to choose between the arithmetic-write fast +/// path (no varlen before this column, so the within-row position is constant) and the +/// cursor-write path. +#[derive(Clone, Copy, Debug)] +#[allow( + dead_code, + reason = "fields read by the RowEncode pipeline in a later commit" +)] +pub(crate) enum ColKind { + /// Column has fixed width `width`. `prefix` is the within-row byte offset of this + /// column's first byte. If `before_varlen` is true, no variable-length column precedes + /// this one, so the within-row offset is constant for every row. + Fixed { + width: u32, + prefix: u32, + before_varlen: bool, + }, + /// Column has variable per-row width. `fixed_prefix` is the sum of widths of all + /// preceding fixed columns; the varlen contribution from earlier varlen columns is + /// added per row. + Variable { fixed_prefix: u32 }, +} + +/// Result of the size pass: enough information for both [`RowSize::execute`] and the +/// downstream [`RowEncode`](super::encode::RowEncode) pipeline. +pub(crate) struct SizePassResult { + pub fixed_per_row: u32, + pub var_lengths: Option>, + pub col_kinds: Vec, + pub first_varlen_idx: Option, + pub columns: Vec, +} + +/// Walk N input columns once, classifying each as fixed-width or variable-length and +/// accumulating per-row size contributions. +/// +/// Fixed-width columns contribute a single scalar increment to `fixed_per_row`; they do +/// not touch `var_lengths`. Variable-length columns add per-row contributions into the +/// lazily-allocated `var_lengths` vec via [`dispatch_size`]. +/// +/// This is shared by [`RowSize::execute`] (which wraps the result into a +/// `Struct { fixed, var }`) and the [`RowEncode`](super::encode::RowEncode) pipeline +/// (which uses the full result, including `col_kinds`, to drive the encode pass). +pub(crate) fn compute_sizes( + options: &RowEncodeOptions, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + op_name: &'static str, +) -> VortexResult { + let n_inputs = args.num_inputs(); + if n_inputs == 0 { + vortex_bail!("{} requires at least one input column", op_name); + } + if options.fields.len() != n_inputs { + vortex_bail!( + "{} options.fields.len()={} does not match num_inputs={}", + op_name, + options.fields.len(), + n_inputs + ); + } + let nrows = args.row_count(); + + let mut columns: Vec = Vec::with_capacity(n_inputs); + let mut col_kinds: Vec = Vec::with_capacity(n_inputs); + let mut fixed_per_row: u32 = 0; + let mut var_lengths: Option> = None; + let mut first_varlen_idx: Option = None; + let mut running_fixed_prefix: u32 = 0; + + for i in 0..n_inputs { + let col = args.get(i)?; + if col.len() != nrows { + vortex_bail!( + "{}: column {} has length {} but expected {}", + op_name, + i, + col.len(), + nrows + ); + } + match codec::row_width_for_dtype(col.dtype())? { + RowWidth::Fixed(w) => { + col_kinds.push(ColKind::Fixed { + width: w, + prefix: running_fixed_prefix, + before_varlen: first_varlen_idx.is_none(), + }); + fixed_per_row = fixed_per_row + .checked_add(w) + .vortex_expect("row width overflow"); + running_fixed_prefix = running_fixed_prefix + .checked_add(w) + .vortex_expect("row width overflow"); + } + RowWidth::Variable => { + if first_varlen_idx.is_none() { + first_varlen_idx = Some(i); + } + let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]); + dispatch_size(&col, options.fields[i], v, ctx)?; + col_kinds.push(ColKind::Variable { + fixed_prefix: running_fixed_prefix, + }); + } + } + columns.push(col); + } + + Ok(SizePassResult { + fixed_per_row, + var_lengths, + col_kinds, + first_varlen_idx, + columns, + }) +} + +/// Variadic scalar function that, given N input columns and per-column [`SortField`]s, +/// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the +/// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode). +/// +/// The `fixed` field is always a [`ConstantArray`] holding the sum of the per-column +/// constant widths of fixed-width inputs (sentinel + value bytes). The `var` field is a +/// `ConstantArray(0)` when there are no variable-length input columns, and a +/// [`PrimitiveArray`] of per-row varlen-byte sums otherwise. +/// +/// The total per-row byte size is `fixed + var`. +#[derive(Clone, Debug)] +pub struct RowSize; + +/// Returns the [`FieldNames`] used by the [`RowSize`] output struct. +pub(crate) fn row_size_field_names() -> FieldNames { + FieldNames::from([FieldName::from("fixed"), FieldName::from("var")]) +} + +/// Returns the output [`DType`] of [`RowSize`]. +pub(crate) fn row_size_struct_dtype() -> DType { + DType::Struct( + StructFields::new( + row_size_field_names(), + vec![ + DType::Primitive(PType::U32, Nullability::NonNullable), + DType::Primitive(PType::U32, Nullability::NonNullable), + ], + ), + Nullability::NonNullable, + ) +} + +impl ScalarFnVTable for RowSize { + type Options = RowEncodeOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::from("vortex.row_size") + } + + fn serialize(&self, options: &Self::Options) -> VortexResult>> { + Ok(Some(serialize_row_encode_options(options))) + } + + fn deserialize( + &self, + metadata: &[u8], + _session: &VortexSession, + ) -> VortexResult { + deserialize_row_encode_options(metadata) + } + + fn arity(&self, _options: &Self::Options) -> Arity { + Arity::Variadic { min: 1, max: None } + } + + fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName { + ChildName::from(Arc::from(format!("col_{}", child_idx))) + } + + fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult { + Ok(row_size_struct_dtype()) + } + + fn execute( + &self, + options: &Self::Options, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let nrows = args.row_count(); + let result = compute_sizes(options, args, ctx, "RowSize")?; + let fixed_array = + ConstantArray::new(Scalar::from(result.fixed_per_row), nrows).into_array(); + let var_array = match result.var_lengths { + Some(v) => PrimitiveArray::new(Buffer::::copy_from(&v), Validity::NonNullable) + .into_array(), + None => ConstantArray::new(Scalar::from(0u32), nrows).into_array(), + }; + Ok(StructArray::try_new( + row_size_field_names(), + vec![fixed_array, var_array], + nrows, + Validity::NonNullable, + )? + .into_array()) + } + + fn is_null_sensitive(&self, _options: &Self::Options) -> bool { + true + } + + fn is_fallible(&self, _options: &Self::Options) -> bool { + false + } +} + +/// Dispatch a single column's per-row size contribution. +/// +/// For PR 1 this is just the canonicalize-then-`codec::field_size` fallback path. In-crate +/// fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry for +/// downstream encodings are added in PR 3. +pub fn dispatch_size( + col: &ArrayRef, + field: SortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let canonical = col.clone().execute::(ctx)?; + codec::field_size(&canonical, field, sizes, ctx) +} + +/// Mutate-buffer kernel: add this column's per-row byte contribution into the shared +/// `sizes` slice. Return `Ok(None)` to decline and fall back to the canonical path. +/// +/// Trait is defined now; per-encoding impls and dispatch wiring land in PR 3. +pub trait RowSizeKernel: VTable { + /// Add this column's per-row byte contribution into `sizes`. + fn row_size_contribution( + column: ArrayView<'_, Self>, + field: SortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, + ) -> VortexResult>; +} From 40783a62bcfdc5b5d92782144c101cd577801d11 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:11:06 +0000 Subject: [PATCH 07/10] vortex-row: RowEncode ScalarFn Add the RowEncode variadic scalar function: encode N input columns into a single ListView in a five-phase pipeline. Phase 1: size pass via `compute_sizes`. Phase 2: allocate a zero-initialized output buffer sized to fit every row's encoded bytes; bail if the total exceeds u32::MAX. Phase 3: build per-row `listview_offsets`: i * fixed_per_row for the pure-fixed case, or i * fixed_per_row + exclusive cumsum of varlen lengths otherwise. Uses the simple `Vec::push` + `checked_add` loop. Phase 4: walk columns left-to-right and call `dispatch_encode` for every column (cursor path for all). Each call writes its per-row bytes at `offsets[i] + cursors[i]` and advances the cursor. Phase 5: build the ListView via the validating `try_new` constructor. `dispatch_encode` is the canonicalize-then-`codec::field_encode` fallback; in-crate kernel arms and the inventory registry land in PR 3. The `RowEncodeKernel` trait is defined but unused. PR 2 will iterate on this pipeline (skip zero-init, skip ListView validation, auto- vectorize the offsets loop, etc.). Signed-off-by: Claude --- vortex-row/public-api.lock | 76 ++++++++++++ vortex-row/src/encode.rs | 238 +++++++++++++++++++++++++++++++++++++ vortex-row/src/lib.rs | 3 + vortex-row/src/size.rs | 8 ++ 4 files changed, 325 insertions(+) create mode 100644 vortex-row/src/encode.rs diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index 85985bf7521..f999303948d 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -102,6 +102,46 @@ pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult +pub mod vortex_row::encode + +pub struct vortex_row::encode::RowEncode + +impl core::clone::Clone for vortex_row::encode::RowEncode + +pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode + +impl core::fmt::Debug for vortex_row::encode::RowEncode + +pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode + +pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions + +pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity + +pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName + +pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult + +pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId + +pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool + +pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool + +pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult + +pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> + +pub trait vortex_row::encode::RowEncodeKernel: vortex_array::array::vtable::VTable + +pub fn vortex_row::encode::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub fn vortex_row::encode::dispatch_encode(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + pub mod vortex_row::options pub struct vortex_row::options::RowEncodeOptions @@ -222,6 +262,38 @@ pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::arra pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> +pub struct vortex_row::RowEncode + +impl core::clone::Clone for vortex_row::encode::RowEncode + +pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode + +impl core::fmt::Debug for vortex_row::encode::RowEncode + +pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode + +pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions + +pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity + +pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName + +pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult + +pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId + +pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool + +pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool + +pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult + +pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> + pub struct vortex_row::RowEncodeOptions pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> @@ -330,6 +402,10 @@ impl core::marker::Copy for vortex_row::options::SortField impl core::marker::StructuralPartialEq for vortex_row::options::SortField +pub trait vortex_row::RowEncodeKernel: vortex_array::array::vtable::VTable + +pub fn vortex_row::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs new file mode 100644 index 00000000000..1b77d955964 --- /dev/null +++ b/vortex-row/src/encode.rs @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow( + clippy::cast_possible_truncation, + reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32" +)] + +//! `RowEncode` variadic scalar function: encode N input columns into a single `ListView`. +//! +//! The output's `(elements, offsets, sizes)` triple is built up in a single left-to-right +//! pass over the input columns. The `sizes` array doubles as the per-row write cursor, so +//! when the last column finishes encoding, the accumulator is the final array - no separate +//! conversion step is needed. + +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::VTable; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar_fn::Arity; +use vortex_array::scalar_fn::ChildName; +use vortex_array::scalar_fn::ExecutionArgs; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_session::VortexSession; + +use crate::codec; +use crate::options::RowEncodeOptions; +use crate::options::SortField; +use crate::options::deserialize_row_encode_options; +use crate::options::serialize_row_encode_options; +use crate::size::compute_sizes; + +/// Variadic scalar function that encodes N input columns into a single `List` +/// [`ListViewArray`] where row `i` contains the row-encoded bytes for column values +/// `cols[0][i], cols[1][i], ...` concatenated left-to-right. +#[derive(Clone, Debug)] +pub struct RowEncode; + +impl ScalarFnVTable for RowEncode { + type Options = RowEncodeOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::from("vortex.row_encode") + } + + fn serialize(&self, options: &Self::Options) -> VortexResult>> { + Ok(Some(serialize_row_encode_options(options))) + } + + fn deserialize( + &self, + metadata: &[u8], + _session: &VortexSession, + ) -> VortexResult { + deserialize_row_encode_options(metadata) + } + + fn arity(&self, _options: &Self::Options) -> Arity { + Arity::Variadic { min: 1, max: None } + } + + fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName { + ChildName::from(Arc::from(format!("col_{}", child_idx))) + } + + fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult { + Ok(DType::List( + Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), + Nullability::NonNullable, + )) + } + + fn execute( + &self, + options: &Self::Options, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + execute_row_encode(options, args, ctx) + } + + fn is_null_sensitive(&self, _options: &Self::Options) -> bool { + true + } + + fn is_fallible(&self, _options: &Self::Options) -> bool { + false + } +} + +fn execute_row_encode( + options: &RowEncodeOptions, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let nrows = args.row_count(); + + // ===== Phase 1: classify + size pass ===== + let crate::size::SizePassResult { + fixed_per_row, + var_lengths, + col_kinds: _, + first_varlen_idx: _, + columns, + } = compute_sizes(options, args, ctx, "RowEncode")?; + + // ===== Phase 2: totals + buffer ===== + let var_total: u64 = var_lengths + .as_ref() + .map_or(0, |v| v.iter().map(|&x| u64::from(x)).sum()); + let total: u64 = (nrows as u64) + .checked_mul(u64::from(fixed_per_row)) + .and_then(|t| t.checked_add(var_total)) + .vortex_expect("row-encoded total bytes overflow"); + if total > u32::MAX as u64 { + vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total); + } + let total_len = total as usize; + + // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder + // assume previously-untouched bytes are zero, simplifying the null-row fill paths. + // PR 2 skips this memset because every byte in the output range is written by some + // encoder. + let mut out_buf: BufferMut = BufferMut::with_capacity(total_len); + out_buf.push_n(0u8, total_len); + + // ===== Phase 3: per-row offsets ===== + // listview_offsets[i] is the absolute byte offset where row `i` begins. + // For pure-fixed: i * fixed_per_row. + // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths. + let mut listview_offsets: Vec = Vec::with_capacity(nrows); + match var_lengths.as_ref() { + None => { + for i in 0..nrows { + listview_offsets.push( + (i as u32) + .checked_mul(fixed_per_row) + .vortex_expect("row offset overflow (already validated total fits in u32)"), + ); + } + } + Some(v) => { + let mut acc: u32 = 0; + for (i, &l) in v.iter().enumerate() { + let off = (i as u32) + .checked_mul(fixed_per_row) + .and_then(|t| t.checked_add(acc)) + .vortex_expect("row offset overflow"); + listview_offsets.push(off); + acc = acc.checked_add(l).vortex_expect("varlen prefix overflow"); + } + } + } + + // Per-row write cursor (also doubles as the ListView `sizes` slot when done). + let mut row_cursors = vec![0u32; nrows]; + + // ===== Phase 4: encode columns via the cursor path ===== + for (i, col) in columns.iter().enumerate() { + dispatch_encode( + col, + options.fields[i], + &listview_offsets, + &mut row_cursors, + &mut out_buf, + ctx, + )?; + } + + // ===== Phase 5: build ListView output ===== + let elements = PrimitiveArray::new(out_buf.freeze(), Validity::NonNullable).into_array(); + let offsets_arr = PrimitiveArray::new( + Buffer::::copy_from(&listview_offsets), + Validity::NonNullable, + ) + .into_array(); + let sizes_arr = PrimitiveArray::new( + Buffer::::copy_from(&row_cursors), + Validity::NonNullable, + ) + .into_array(); + Ok( + ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)? + .into_array(), + ) +} + +/// Dispatch a single column's encoding into the shared `out` buffer. +/// +/// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path. +/// In-crate fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry +/// for downstream encodings are added in PR 3. +pub fn dispatch_encode( + col: &ArrayRef, + field: SortField, + offsets: &[u32], + cursors: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let canonical = col.clone().execute::(ctx)?; + codec::field_encode(&canonical, field, offsets, cursors, out, ctx) +} + +/// Mutate-buffer kernel: write this column's per-row bytes into `out` at +/// `offsets[i] + cursors[i]`, advancing `cursors[i]` by the bytes written. +/// +/// Return `Ok(None)` to decline and fall back to the canonical path. +/// +/// Trait is defined now; per-encoding impls and dispatch wiring land in PR 3. +pub trait RowEncodeKernel: VTable { + /// Write this column's per-row bytes into `out` at `offsets[i] + cursors[i]`, advancing + /// `cursors[i]` by the bytes written. + fn row_encode_into( + column: ArrayView<'_, Self>, + field: SortField, + offsets: &[u32], + cursors: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, + ) -> VortexResult>; +} diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index 6f1d8fbeab3..ef0209f3d9c 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -7,9 +7,12 @@ //! This commit only establishes the crate skeleton and an `initialize` stub. pub mod codec; +pub mod encode; pub mod options; pub mod size; +pub use encode::RowEncode; +pub use encode::RowEncodeKernel; pub use options::RowEncodeOptions; pub use options::SortField; pub use size::RowSize; diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index fbde52e1863..7148a2a21d8 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -72,7 +72,15 @@ pub(crate) enum ColKind { pub(crate) struct SizePassResult { pub fixed_per_row: u32, pub var_lengths: Option>, + #[allow( + dead_code, + reason = "consumed by the arithmetic-write fast path added in PR 2" + )] pub col_kinds: Vec, + #[allow( + dead_code, + reason = "consumed by the arithmetic-write fast path added in PR 2" + )] pub first_varlen_idx: Option, pub columns: Vec, } From 87febfe4a71de78b52c0dd15917ca78390524889 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:16:19 +0000 Subject: [PATCH 08/10] vortex-row: convert_columns + tests + bench scaffolding Wire the RowSize/RowEncode scalar functions to the user-facing API: - `convert_columns` accepts a slice of input arrays and per-column SortFields, constructs `RowEncodeOptions` + `VecExecutionArgs`, and returns the encoded `ListViewArray`. - `compute_row_sizes` returns just the per-row sizes (the `Struct { fixed: u32, var: u32 }` output of `RowSize`). - `initialize()` now registers `RowSize` and `RowEncode` on the given session so they are reachable via the expression layer. Tests cover sort-order round-trips for bool, primitive (i64 asc/desc, u32, f64), utf8, multi-column, nulls_first/last, struct sort-order, the single-buffer invariant of the ListView output, and the structural shape of `RowSize`. Tests that exercise per-encoding fast paths (`constant_path_matches_canonical`, `dict_path_matches_canonical`) land together with their respective kernels in PR 3. The bench file uses divan + mimalloc and reports throughput in GB/s of encoded output bytes for primitive_i64, utf8, and struct_mixed. Each has an `arrow_row` baseline and a `vortex` measurement. Per-encoding fast-path scenarios (constant/dict/patched/bitpacked/for/delta) gain their triplets in PR 3. Baseline measurements at this commit (sample-count=10): primitive_i64_vortex ~1.97 GB/s (vs arrow-row 4.12 GB/s) utf8_vortex ~0.87 GB/s (vs arrow-row 1.56 GB/s) struct_mixed_vortex ~0.95 GB/s (vs arrow-row 1.19 GB/s) PR 2 closes most of the gap by replacing the validating `ListViewArray::try_new` with `new_unchecked`, skipping the buffer zero-init, auto-vectorizing the offsets and varlen-block paths, etc. Signed-off-by: Claude --- Cargo.lock | 7 + Cargo.toml | 3 +- vortex-row/Cargo.toml | 14 ++ vortex-row/benches/row_encode.rs | 177 +++++++++++++++++ vortex-row/public-api.lock | 10 + vortex-row/src/convert.rs | 75 +++++++ vortex-row/src/lib.rs | 41 +++- vortex-row/src/tests.rs | 324 +++++++++++++++++++++++++++++++ 8 files changed, 645 insertions(+), 6 deletions(-) create mode 100644 vortex-row/benches/row_encode.rs create mode 100644 vortex-row/src/convert.rs create mode 100644 vortex-row/src/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 0b84f6dd260..23ca027d592 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11038,7 +11038,14 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ + "arrow-array 58.2.0", + "arrow-row 58.2.0", + "arrow-schema 58.2.0", "bytes", + "codspeed-divan-compat", + "mimalloc", + "rand 0.10.1", + "rstest", "smallvec", "vortex-array", "vortex-buffer", diff --git a/Cargo.toml b/Cargo.toml index 9fae5b564bf..cdf28137563 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,12 +7,12 @@ members = [ "vortex-mask", "vortex-utils", "vortex-session", - "vortex-row", "vortex-flatbuffers", "vortex-metrics", "vortex-io", "vortex-proto", "vortex-array", + "vortex-row", "vortex-tensor", "vortex-turboquant", "vortex-compressor", @@ -103,6 +103,7 @@ arrow-cast = "58" arrow-data = "58" arrow-ipc = "58" arrow-ord = "58" +arrow-row = "58" arrow-schema = "58" arrow-select = "58" arrow-string = "58" diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml index aaed9a55f51..50d6547474a 100644 --- a/vortex-row/Cargo.toml +++ b/vortex-row/Cargo.toml @@ -24,3 +24,17 @@ vortex-buffer = { workspace = true } vortex-error = { workspace = true } vortex-mask = { workspace = true } vortex-session = { workspace = true } + +[dev-dependencies] +arrow-array = { workspace = true } +arrow-row = { workspace = true } +arrow-schema = { workspace = true } +divan = { workspace = true } +mimalloc = { workspace = true } +rand = { workspace = true } +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[[bench]] +name = "row_encode" +harness = false diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs new file mode 100644 index 00000000000..8d631d785da --- /dev/null +++ b/vortex-row/benches/row_encode.rs @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![expect( + clippy::unwrap_used, + clippy::clone_on_ref_ptr, + clippy::cloned_ref_to_slice_refs, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::redundant_clone +)] + +//! Row-encode throughput benchmarks comparing `arrow-row` against vortex's `convert_columns` +//! for the canonical scenarios shipped in PR 1: a primitive i64 column, a Utf8 column, +//! and a mixed-field struct. Per-encoding fast paths (Constant, Dict, Patched, BitPacked, +//! FoR, Delta) gain their own triplets in PR 3. + +use std::sync::Arc; + +use arrow_array::Int64Array; +use arrow_array::StringArray; +use arrow_array::StructArray as ArrowStructArray; +use arrow_row::RowConverter; +use arrow_row::SortField as ArrowSortField; +use arrow_schema::DataType; +use arrow_schema::Field; +use divan::counter::BytesCount; +use mimalloc::MiMalloc; +use rand::RngExt; +use rand::SeedableRng; +use rand::distr::Alphanumeric; +use rand::rngs::StdRng; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_row::SortField; +use vortex_row::convert_columns; + +#[global_allocator] +static GLOBAL: MiMalloc = MiMalloc; + +const N: usize = 100_000; + +fn main() { + divan::main(); +} + +fn gen_i64(n: usize, seed: u64) -> Vec { + let mut rng = StdRng::seed_from_u64(seed); + (0..n) + .map(|_| rng.random_range(i64::MIN..i64::MAX)) + .collect() +} + +fn gen_words(n: usize, mean_len: usize, seed: u64) -> Vec { + let rng = &mut StdRng::seed_from_u64(seed); + (0..n) + .map(|_| { + let len = rng.random_range(mean_len.saturating_sub(4)..=mean_len + 4); + rng.sample_iter(&Alphanumeric) + .take(len) + .map(char::from) + .collect::() + }) + .collect() +} + +// ---------- primitive_i64 ---------- + +#[divan::bench] +fn primitive_i64_arrow_row(bencher: divan::Bencher) { + let v = gen_i64(N, 0); + let arr = Arc::new(Int64Array::from(v.clone())) as arrow_array::ArrayRef; + let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap(); + let bytes = (N * (1 + 8)) as u64; + bencher + .counter(BytesCount::new(bytes)) + .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap()) +} + +#[divan::bench] +fn primitive_i64_vortex(bencher: divan::Bencher) { + let v = gen_i64(N, 0); + let col = PrimitiveArray::from_iter(v.clone()).into_array(); + let bytes = (N * (1 + 8)) as u64; + bencher.counter(BytesCount::new(bytes)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap() + }) +} + +// ---------- utf8 ---------- + +#[divan::bench] +fn utf8_arrow_row(bencher: divan::Bencher) { + let words = gen_words(N, 16, 7); + let total: u64 = words + .iter() + .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64) + .sum(); + let arr = Arc::new(StringArray::from(words.clone())) as arrow_array::ArrayRef; + let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap(); + bencher + .counter(BytesCount::new(total)) + .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap()) +} + +#[divan::bench] +fn utf8_vortex(bencher: divan::Bencher) { + let words = gen_words(N, 16, 7); + let total: u64 = words + .iter() + .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64) + .sum(); + let col = VarBinViewArray::from_iter_str(words.iter().map(String::as_str)).into_array(); + bencher.counter(BytesCount::new(total)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap() + }) +} + +// ---------- struct_mixed ---------- + +fn struct_mixed_inputs() -> (Vec, Vec, u64) { + let ids = gen_i64(N, 1); + let names = gen_words(N, 16, 2); + // sentinel (1) + i64 (1+8=9) + utf8-name (1 + ceil(len/32)*33) + let total: u64 = (0..N) + .map(|i| { + let name_bytes = 1 + (names[i].len().div_ceil(32) * 33) as u64; + 1u64 + 9u64 + name_bytes + }) + .sum(); + (ids, names, total) +} + +#[divan::bench] +fn struct_mixed_arrow_row(bencher: divan::Bencher) { + let (ids, names, total) = struct_mixed_inputs(); + let id_arr = Arc::new(Int64Array::from(ids)) as arrow_array::ArrayRef; + let name_arr = Arc::new(StringArray::from(names)) as arrow_array::ArrayRef; + let arrow_struct = Arc::new(ArrowStructArray::from(vec![ + (Arc::new(Field::new("id", DataType::Int64, false)), id_arr), + ( + Arc::new(Field::new("name", DataType::Utf8, false)), + name_arr, + ), + ])) as arrow_array::ArrayRef; + let struct_fields = vec![ + Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Field::new("name", DataType::Utf8, false)), + ]; + let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Struct( + struct_fields.into(), + ))]) + .unwrap(); + bencher + .counter(BytesCount::new(total)) + .bench_local(|| conv.convert_columns(&[arrow_struct.clone()]).unwrap()) +} + +#[divan::bench] +fn struct_mixed_vortex(bencher: divan::Bencher) { + let (ids, names, total) = struct_mixed_inputs(); + let id_arr = PrimitiveArray::from_iter(ids).into_array(); + let name_arr = VarBinViewArray::from_iter_str(names.iter().map(String::as_str)).into_array(); + let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)]) + .unwrap() + .into_array(); + bencher.counter(BytesCount::new(total)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + convert_columns(&[struct_arr.clone()], &[SortField::default()], &mut ctx).unwrap() + }) +} diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index f999303948d..ed231a1e556 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -102,6 +102,12 @@ pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult +pub mod vortex_row::convert + +pub fn vortex_row::convert::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_row::convert::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + pub mod vortex_row::encode pub struct vortex_row::encode::RowEncode @@ -410,4 +416,8 @@ pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_row::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_row::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_row::initialize(&vortex_session::VortexSession) diff --git a/vortex-row/src/convert.rs b/vortex-row/src/convert.rs new file mode 100644 index 00000000000..c3b06d92748 --- /dev/null +++ b/vortex-row/src/convert.rs @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! User-facing entry point: turn N columnar arrays into one row-encoded `ListView`. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::ListViewArray; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::scalar_fn::VecExecutionArgs; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::encode::RowEncode; +use crate::options::RowEncodeOptions; +use crate::options::SortField; +use crate::size::RowSize; + +/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose +/// bytes are lexicographically comparable in the same order as a tuple comparison of the +/// input values according to `fields`. +pub fn convert_columns( + cols: &[ArrayRef], + fields: &[SortField], + ctx: &mut ExecutionCtx, +) -> VortexResult { + if cols.len() != fields.len() { + vortex_bail!( + "convert_columns: cols.len() ({}) does not match fields.len() ({})", + cols.len(), + fields.len() + ); + } + if cols.is_empty() { + vortex_bail!("convert_columns: at least one column is required"); + } + let nrows = cols[0].len(); + for (i, col) in cols.iter().enumerate() { + if col.len() != nrows { + vortex_bail!( + "convert_columns: column {} has length {} but expected {}", + i, + col.len(), + nrows + ); + } + } + + let options = RowEncodeOptions::new(fields.iter().copied()); + let args = VecExecutionArgs::new(cols.to_vec(), nrows); + let result = RowEncode.execute(&options, &args, ctx)?; + result.execute::(ctx) +} + +/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns. +pub fn compute_row_sizes( + cols: &[ArrayRef], + fields: &[SortField], + ctx: &mut ExecutionCtx, +) -> VortexResult { + if cols.len() != fields.len() { + vortex_bail!( + "compute_row_sizes: cols.len() ({}) does not match fields.len() ({})", + cols.len(), + fields.len() + ); + } + if cols.is_empty() { + vortex_bail!("compute_row_sizes: at least one column is required"); + } + let nrows = cols[0].len(); + let options = RowEncodeOptions::new(fields.iter().copied()); + let args = VecExecutionArgs::new(cols.to_vec(), nrows); + RowSize.execute(&options, &args, ctx) +} diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index ef0209f3d9c..fddcca665c1 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -3,23 +3,54 @@ //! Row-oriented byte encoder, analogous to Apache Arrow's `arrow-row` crate. //! -//! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths. -//! This commit only establishes the crate skeleton and an `initialize` stub. +//! The encoder converts N columnar arrays into a single `List` array where each row's +//! bytes are lexicographically comparable in the same order as a tuple comparison of the +//! original values. This is useful for sorting, hashing into row containers, and other +//! operations that benefit from a sort-friendly opaque byte representation of a multi-column +//! key. +//! +//! Two variadic scalar functions drive the implementation: +//! - [`RowSize`] computes per-row byte sizes across all N input columns. +//! - [`RowEncode`] writes the row-encoded bytes into a single `ListView` accumulator +//! in one left-to-right pass. +//! +//! Each scalar function exposes a per-encoding fast-path trait +//! ([`RowSizeKernel`] / [`RowEncodeKernel`]) for downstream encodings to plug into; PR 3 +//! adds in-crate impls for `Constant`, `Dict`, and `Patched` and an inventory-based +//! registry for external encodings. +//! +//! The user-facing entry point is [`convert_columns`]. +//! +//! Row-encoding scalar functions are not registered in the default +//! [`VortexSession`]. Call [`initialize`] on a session to make `RowSize` and `RowEncode` +//! available via the expression layer. pub mod codec; +pub mod convert; pub mod encode; pub mod options; pub mod size; +#[cfg(test)] +mod tests; + +pub use convert::compute_row_sizes; +pub use convert::convert_columns; pub use encode::RowEncode; pub use encode::RowEncodeKernel; pub use options::RowEncodeOptions; pub use options::SortField; pub use size::RowSize; pub use size::RowSizeKernel; +use vortex_array::scalar_fn::session::ScalarFnSessionExt; use vortex_session::VortexSession; -/// Register the row-encoding scalar functions on the given session. +/// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given +/// session. /// -/// Currently a stub: subsequent commits register `RowSize` and `RowEncode` here. -pub fn initialize(_session: &VortexSession) {} +/// Call once on session construction if you want row encoding available via the expression +/// layer or via [`convert_columns`]. +pub fn initialize(session: &VortexSession) { + session.scalar_fns().register(RowSize); + session.scalar_fns().register(RowEncode); +} diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs new file mode 100644 index 00000000000..ff7d8fb274a --- /dev/null +++ b/vortex-row/src/tests.rs @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow( + clippy::approx_constant, + clippy::cloned_ref_to_slice_refs, + clippy::redundant_clone, + reason = "tests value clarity over micro-optimization" +)] + +//! Tests for the row encoder. + +use rstest::rstest; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::listview::ListViewArrayExt; +use vortex_error::VortexResult; + +use crate::SortField; +use crate::convert_columns; + +fn collect_row_bytes(array: &ListViewArray) -> Vec> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let nrows = array.len(); + (0..nrows) + .map(|i| { + let slice = array.list_elements_at(i).unwrap(); + let p = slice.execute::(&mut ctx).unwrap(); + p.as_slice::().to_vec() + }) + .collect() +} + +/// Encode each column independently, sort the resulting row bytes, and check the permutation +/// matches the natural sort order of `values`. +fn assert_sort_order_i64(values: Vec, descending: bool) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let col = PrimitiveArray::from_iter(values.clone()).into_array(); + let field = SortField { + descending, + nulls_first: true, + }; + let encoded = convert_columns(&[col], &[field], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + // Build expected permutation: sort values naturally then compare to bytes-sorted order. + let mut idx: Vec = (0..values.len()).collect(); + if descending { + idx.sort_by(|a, b| values[*b].cmp(&values[*a])); + } else { + idx.sort_by(|a, b| values[*a].cmp(&values[*b])); + } + let expected_order: Vec> = idx.iter().map(|&i| rows[i].clone()).collect(); + + let mut sorted = rows.clone(); + sorted.sort(); + assert_eq!( + sorted, expected_order, + "Row-encoded bytes do not match natural sort order" + ); + Ok(()) +} + +#[rstest] +#[case::ascending(false)] +#[case::descending(true)] +fn primitive_i64_roundtrip(#[case] descending: bool) -> VortexResult<()> { + let values: Vec = vec![-5, 0, 5, i64::MIN, i64::MAX, 7, -7, 1]; + assert_sort_order_i64(values, descending) +} + +#[test] +fn primitive_u32_sort_order() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = vec![0, 1, 100, u32::MAX, 42, 17]; + let col = PrimitiveArray::from_iter(values.clone()).into_array(); + let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + let mut sorted_rows = rows.clone(); + sorted_rows.sort(); + + let mut sorted_idx: Vec = (0..values.len()).collect(); + sorted_idx.sort_by(|a, b| values[*a].cmp(&values[*b])); + let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted_rows, expected); + Ok(()) +} + +#[test] +fn primitive_f64_sort_order() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // We use IEEE total-ordering semantics: -0.0 < +0.0 in the byte encoding (matches + // `arrow-row`). Avoid -0.0 in the natural-order baseline since partial_cmp says + // -0.0 == 0.0. + let values: Vec = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, 3.14]; + let col = PrimitiveArray::from_iter(values.clone()).into_array(); + let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + let mut sorted_rows = rows.clone(); + sorted_rows.sort(); + + let mut sorted_idx: Vec = (0..values.len()).collect(); + sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap()); + let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted_rows, expected); + Ok(()) +} + +#[test] +fn bool_sort_order() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let col = BoolArray::from_iter([true, false, true, false]).into_array(); + let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + let mut sorted = rows.clone(); + sorted.sort(); + // false rows come first (2x), true rows after (2x) + assert_eq!(sorted[0], rows[1]); + assert_eq!(sorted[1], rows[3]); + assert_eq!(sorted[2], rows[0]); + assert_eq!(sorted[3], rows[2]); + Ok(()) +} + +#[test] +fn utf8_sort_order() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values = vec![ + "banana", + "apple", + "", + "cherry", + "ban", + "banana_loaf_for_test", + ]; + let col = VarBinViewArray::from_iter_str(values.clone()).into_array(); + let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + let mut sorted = rows.clone(); + sorted.sort(); + + let mut sorted_idx: Vec = (0..values.len()).collect(); + sorted_idx.sort_by(|a, b| values[*a].cmp(values[*b])); + let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted, expected); + Ok(()) +} + +#[test] +fn multi_column_sort() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let ints: Vec = vec![1, 2, 1, 2, 1, 3]; + let strs = vec!["b", "a", "a", "b", "c", "z"]; + let col0 = PrimitiveArray::from_iter(ints.clone()).into_array(); + let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array(); + let encoded = convert_columns( + &[col0, col1], + &[SortField::default(), SortField::default()], + &mut ctx, + )?; + let rows = collect_row_bytes(&encoded); + + let mut sorted = rows.clone(); + sorted.sort(); + let mut idx: Vec = (0..ints.len()).collect(); + idx.sort_by(|a, b| ints[*a].cmp(&ints[*b]).then_with(|| strs[*a].cmp(strs[*b]))); + let expected: Vec> = idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted, expected); + Ok(()) +} + +#[test] +fn nulls_first_and_last() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec> = vec![Some(5), None, Some(1), None, Some(3)]; + let col = PrimitiveArray::from_option_iter(values.clone()).into_array(); + + // nulls_first=true + let encoded = convert_columns( + &[col.clone()], + &[SortField { + descending: false, + nulls_first: true, + }], + &mut ctx, + )?; + let rows = collect_row_bytes(&encoded); + let mut sorted = rows.clone(); + sorted.sort(); + // The first two sorted entries should be nulls + let null_count = values.iter().filter(|v| v.is_none()).count(); + for i in 0..null_count { + // a null encoded row begins with 0x00 + assert_eq!(sorted[i][0], 0x00); + } + // nulls_first=false + let encoded = convert_columns( + &[col], + &[SortField { + descending: false, + nulls_first: false, + }], + &mut ctx, + )?; + let rows = collect_row_bytes(&encoded); + let mut sorted = rows.clone(); + sorted.sort(); + // The last two sorted entries should be nulls + for i in 0..null_count { + let pos = sorted.len() - 1 - i; + assert_eq!(sorted[pos][0], 0x02); + } + Ok(()) +} + +#[test] +fn struct_sort_order() -> VortexResult<()> { + use vortex_array::arrays::StructArray; + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let ids: Vec = vec![3, 1, 3, 1, 2]; + let names = vec!["b", "a", "a", "b", "z"]; + let id_arr = PrimitiveArray::from_iter(ids.clone()).into_array(); + let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array(); + let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])?.into_array(); + + let encoded = convert_columns(&[struct_arr], &[SortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + let mut sorted = rows.clone(); + sorted.sort(); + let mut idx: Vec = (0..ids.len()).collect(); + idx.sort_by(|a, b| ids[*a].cmp(&ids[*b]).then_with(|| names[*a].cmp(names[*b]))); + let expected: Vec> = idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted, expected); + Ok(()) +} + +#[test] +fn row_size_struct_shape() -> VortexResult<()> { + use vortex_array::arrays::Constant; + use vortex_array::arrays::StructArray; + use vortex_array::arrays::struct_::StructArrayExt; + + use crate::compute_row_sizes; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let ints: Vec = vec![1, 2, 3, 4, 5]; + let strs = vec!["a", "bb", "ccc", "", "eeeee"]; + let col0 = PrimitiveArray::from_iter(ints).into_array(); + let col1 = VarBinViewArray::from_iter_str(strs).into_array(); + + let sizes = compute_row_sizes( + &[col0, col1], + &[SortField::default(), SortField::default()], + &mut ctx, + )?; + // Shape must be Struct { fixed, var } + let struct_arr = sizes.execute::(&mut ctx)?; + assert_eq!(struct_arr.struct_fields().nfields(), 2); + let fixed = struct_arr.unmasked_field(0); + let var = struct_arr.unmasked_field(1); + + // `fixed` must be ConstantArray with value = encoded i32 width = 1 + 4 = 5. + let fixed_const = fixed + .as_opt::() + .expect("fixed field should be a ConstantArray"); + assert_eq!( + fixed_const.scalar(), + &vortex_array::scalar::Scalar::from(5u32), + "fixed scalar should be encoded primitive i32 width" + ); + + // `var` must be a PrimitiveArray, since we have a varlen column. + let var_prim = var.clone().execute::(&mut ctx)?; + let v: &[u32] = var_prim.as_slice(); + assert_eq!(v.len(), 5); + // empty string: sentinel(1) + 1 byte; non-empty: sentinel(1) + 33 bytes (single block). + let expected: Vec = vec![34, 34, 34, 2, 34]; + assert_eq!(v, expected.as_slice()); + Ok(()) +} + +#[test] +fn single_buffer_invariant() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // Encoded rows here are all > 12 bytes, forcing the Ref-view path that points back into + // the shared data buffer. + let nrows = 64usize; + let primitives: Vec = (0..nrows as i64).collect(); + let strings: Vec = (0..nrows) + .map(|i| format!("row_{}_with_padding", i)) + .collect(); + let col0 = PrimitiveArray::from_iter(primitives.clone()).into_array(); + let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array(); + let encoded = convert_columns( + &[col0, col1], + &[SortField::default(), SortField::default()], + &mut ctx, + )?; + + let rows = collect_row_bytes(&encoded); + let expected_total: usize = rows.iter().map(|r| r.len()).sum(); + + // The shared data buffer holds the contiguous concatenation of every row's encoded bytes; + // per-row allocations would produce many small buffers instead of one shared buffer. + // ListView's elements array is a single contiguous primitive (u8) array; its length + // equals the sum of all per-row sizes. A per-row allocation strategy would instead + // produce N separate elements arrays or a sparse one. + let elements_len = encoded.elements().len(); + assert_eq!( + elements_len, expected_total, + "elements buffer size mismatch" + ); + Ok(()) +} From 74d89f1f07cc5074832d12e7b17549979c529197 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 12:49:36 +0100 Subject: [PATCH 09/10] t Signed-off-by: Joe Isaacs --- vortex-row/benches/row_encode.rs | 21 +- vortex-row/public-api.lock | 420 ++++++++----------------------- vortex-row/src/codec.rs | 345 ++++--------------------- vortex-row/src/convert.rs | 75 ------ vortex-row/src/encode.rs | 76 ++---- vortex-row/src/lib.rs | 68 ++--- vortex-row/src/options.rs | 140 +++++++---- vortex-row/src/size.rs | 122 ++------- vortex-row/src/tests.rs | 116 ++++++--- 9 files changed, 418 insertions(+), 965 deletions(-) delete mode 100644 vortex-row/src/convert.rs diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs index 8d631d785da..07493d6ad48 100644 --- a/vortex-row/benches/row_encode.rs +++ b/vortex-row/benches/row_encode.rs @@ -5,15 +5,12 @@ clippy::unwrap_used, clippy::clone_on_ref_ptr, clippy::cloned_ref_to_slice_refs, - clippy::cast_possible_truncation, - clippy::cast_possible_wrap, clippy::redundant_clone )] -//! Row-encode throughput benchmarks comparing `arrow-row` against vortex's `convert_columns` -//! for the canonical scenarios shipped in PR 1: a primitive i64 column, a Utf8 column, -//! and a mixed-field struct. Per-encoding fast paths (Constant, Dict, Patched, BitPacked, -//! FoR, Delta) gain their own triplets in PR 3. +//! Row-encode throughput benchmarks comparing `arrow-row` against Vortex's [`RowEncoder`] +//! for the core canonical scenarios: a primitive i64 column, a Utf8 column, and a +//! mixed-field struct. use std::sync::Arc; @@ -36,8 +33,7 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; -use vortex_row::SortField; -use vortex_row::convert_columns; +use vortex_row::RowEncoder; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; @@ -86,9 +82,10 @@ fn primitive_i64_vortex(bencher: divan::Bencher) { let v = gen_i64(N, 0); let col = PrimitiveArray::from_iter(v.clone()).into_array(); let bytes = (N * (1 + 8)) as u64; + let encoder = RowEncoder::default(); bencher.counter(BytesCount::new(bytes)).bench_local(|| { let mut ctx = LEGACY_SESSION.create_execution_ctx(); - convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap() + encoder.encode(&[col.clone()], &mut ctx).unwrap() }) } @@ -116,9 +113,10 @@ fn utf8_vortex(bencher: divan::Bencher) { .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64) .sum(); let col = VarBinViewArray::from_iter_str(words.iter().map(String::as_str)).into_array(); + let encoder = RowEncoder::default(); bencher.counter(BytesCount::new(total)).bench_local(|| { let mut ctx = LEGACY_SESSION.create_execution_ctx(); - convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap() + encoder.encode(&[col.clone()], &mut ctx).unwrap() }) } @@ -170,8 +168,9 @@ fn struct_mixed_vortex(bencher: divan::Bencher) { let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)]) .unwrap() .into_array(); + let encoder = RowEncoder::default(); bencher.counter(BytesCount::new(total)).bench_local(|| { let mut ctx = LEGACY_SESSION.create_execution_ctx(); - convert_columns(&[struct_arr.clone()], &[SortField::default()], &mut ctx).unwrap() + encoder.encode(&[struct_arr.clone()], &mut ctx).unwrap() }) } diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index ed231a1e556..83c40788349 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -1,423 +1,207 @@ pub mod vortex_row -pub mod vortex_row::codec - -pub enum vortex_row::codec::RowWidth - -pub vortex_row::codec::RowWidth::Fixed(u32) - -pub vortex_row::codec::RowWidth::Variable - -impl core::clone::Clone for vortex_row::codec::RowWidth - -pub fn vortex_row::codec::RowWidth::clone(&self) -> vortex_row::codec::RowWidth - -impl core::cmp::Eq for vortex_row::codec::RowWidth - -impl core::cmp::PartialEq for vortex_row::codec::RowWidth - -pub fn vortex_row::codec::RowWidth::eq(&self, &vortex_row::codec::RowWidth) -> bool - -impl core::fmt::Debug for vortex_row::codec::RowWidth - -pub fn vortex_row::codec::RowWidth::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_row::codec::RowWidth - -impl core::marker::StructuralPartialEq for vortex_row::codec::RowWidth - -pub const vortex_row::codec::BOOL_ENCODED_SIZE: u32 - -pub const vortex_row::codec::VARLEN_BLOCK_SIZE: usize - -pub const vortex_row::codec::VARLEN_BLOCK_TOTAL: usize - -pub trait vortex_row::codec::RowEncode: core::marker::Copy - -pub fn vortex_row::codec::RowEncode::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for f32 - -pub fn f32::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for f64 - -pub fn f64::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for half::binary16::f16 - -pub fn half::binary16::f16::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for i128 - -pub fn i128::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for i16 - -pub fn i16::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for i32 - -pub fn i32::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for i64 - -pub fn i64::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for i8 - -pub fn i8::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for u16 - -pub fn u16::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for u32 - -pub fn u32::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for u64 - -pub fn u64::encode_to(self, &mut [u8], bool) - -impl vortex_row::codec::RowEncode for u8 - -pub fn u8::encode_to(self, &mut [u8], bool) - -pub fn vortex_row::codec::encode_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult - -pub fn vortex_row::codec::encode_scalar_bool(core::option::Option, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut) - -pub fn vortex_row::codec::encode_scalar_null(vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) - -pub fn vortex_row::codec::encode_scalar_primitive(vortex_array::dtype::ptype::PType, vortex_array::scalar::typed_view::primitive::pvalue::PValue, vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) -> vortex_error::VortexResult<()> - -pub fn vortex_row::codec::encode_scalar_varlen(core::option::Option<&[u8]>, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut) - -pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult - -pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> - -pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> - -pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult - -pub mod vortex_row::convert - -pub fn vortex_row::convert::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult - -pub fn vortex_row::convert::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult - -pub mod vortex_row::encode - -pub struct vortex_row::encode::RowEncode - -impl core::clone::Clone for vortex_row::encode::RowEncode - -pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode - -impl core::fmt::Debug for vortex_row::encode::RowEncode - -pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode - -pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions - -pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity - -pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName - -pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult - -pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult - -pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId - -pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool - -pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool - -pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult - -pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> - -pub trait vortex_row::encode::RowEncodeKernel: vortex_array::array::vtable::VTable - -pub fn vortex_row::encode::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> - -pub fn vortex_row::encode::dispatch_encode(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> - -pub mod vortex_row::options - -pub struct vortex_row::options::RowEncodeOptions - -pub vortex_row::options::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> - -impl vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator) -> Self - -impl core::clone::Clone for vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions - -impl core::cmp::Eq for vortex_row::options::RowEncodeOptions - -impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool - -impl core::fmt::Debug for vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::fmt::Display for vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::hash::Hash for vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) - -impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions - -pub struct vortex_row::options::SortField - -pub vortex_row::options::SortField::descending: bool - -pub vortex_row::options::SortField::nulls_first: bool - -impl vortex_row::options::SortField - -pub fn vortex_row::options::SortField::new(bool, bool) -> Self - -pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8 - -pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8 - -impl core::clone::Clone for vortex_row::options::SortField - -pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField - -impl core::cmp::Eq for vortex_row::options::SortField - -impl core::cmp::PartialEq for vortex_row::options::SortField - -pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool - -impl core::default::Default for vortex_row::options::SortField - -pub fn vortex_row::options::SortField::default() -> Self - -impl core::fmt::Debug for vortex_row::options::SortField - -pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::fmt::Display for vortex_row::options::SortField +pub struct vortex_row::RowEncode -pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::clone::Clone for vortex_row::RowEncode -impl core::hash::Hash for vortex_row::options::SortField +pub fn vortex_row::RowEncode::clone(&self) -> vortex_row::RowEncode -pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H) +impl core::fmt::Debug for vortex_row::RowEncode -impl core::marker::Copy for vortex_row::options::SortField +pub fn vortex_row::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::StructuralPartialEq for vortex_row::options::SortField +impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::RowEncode -pub const vortex_row::options::FIELDS_INLINE: usize +pub type vortex_row::RowEncode::Options = vortex_row::RowEncodingOptions -pub mod vortex_row::size +pub fn vortex_row::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity -pub struct vortex_row::size::RowSize +pub fn vortex_row::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName -impl core::clone::Clone for vortex_row::size::RowSize +pub fn vortex_row::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize +pub fn vortex_row::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -impl core::fmt::Debug for vortex_row::size::RowSize +pub fn vortex_row::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId -pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_row::RowEncode::is_fallible(&self, &Self::Options) -> bool -impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize +pub fn vortex_row::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool -pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity +pub fn vortex_row::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> -pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName +pub struct vortex_row::RowEncoder -pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult +impl vortex_row::RowEncoder -pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_row::RowEncoder::encode(&self, &[vortex_array::array::erased::ArrayRef], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId +pub fn vortex_row::RowEncoder::new(impl core::iter::traits::collect::IntoIterator) -> Self -pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool +pub fn vortex_row::RowEncoder::options(&self) -> core::option::Option<&vortex_row::RowEncodingOptions> -pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool +pub fn vortex_row::RowEncoder::row_sizes(&self, &[vortex_array::array::erased::ArrayRef], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult +pub fn vortex_row::RowEncoder::with_options(vortex_row::RowEncodingOptions) -> Self -pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> +impl core::clone::Clone for vortex_row::RowEncoder -pub trait vortex_row::size::RowSizeKernel: vortex_array::array::vtable::VTable +pub fn vortex_row::RowEncoder::clone(&self) -> vortex_row::RowEncoder -pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +impl core::cmp::Eq for vortex_row::RowEncoder -pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> +impl core::cmp::PartialEq for vortex_row::RowEncoder -pub struct vortex_row::RowEncode +pub fn vortex_row::RowEncoder::eq(&self, &vortex_row::RowEncoder) -> bool -impl core::clone::Clone for vortex_row::encode::RowEncode +impl core::default::Default for vortex_row::RowEncoder -pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode +pub fn vortex_row::RowEncoder::default() -> vortex_row::RowEncoder -impl core::fmt::Debug for vortex_row::encode::RowEncode +impl core::fmt::Debug for vortex_row::RowEncoder -pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_row::RowEncoder::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode +impl core::hash::Hash for vortex_row::RowEncoder -pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowEncoder::hash<__H: core::hash::Hasher>(&self, &mut __H) -pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity +impl core::marker::StructuralPartialEq for vortex_row::RowEncoder -pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName +pub struct vortex_row::RowEncodingOptions -pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult +impl vortex_row::RowEncodingOptions -pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_row::RowEncodingOptions::default_for_columns(usize) -> Self -pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId +pub fn vortex_row::RowEncodingOptions::fields(&self) -> &[vortex_row::RowSortField] -pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool +pub fn vortex_row::RowEncodingOptions::is_empty(&self) -> bool -pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool +pub fn vortex_row::RowEncodingOptions::len(&self) -> usize -pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult +pub fn vortex_row::RowEncodingOptions::new(impl core::iter::traits::collect::IntoIterator) -> Self -pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> +impl core::clone::Clone for vortex_row::RowEncodingOptions -pub struct vortex_row::RowEncodeOptions +pub fn vortex_row::RowEncodingOptions::clone(&self) -> vortex_row::RowEncodingOptions -pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> +impl core::cmp::Eq for vortex_row::RowEncodingOptions -impl vortex_row::options::RowEncodeOptions +impl core::cmp::PartialEq for vortex_row::RowEncodingOptions -pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_row::RowEncodingOptions::eq(&self, &vortex_row::RowEncodingOptions) -> bool -impl core::clone::Clone for vortex_row::options::RowEncodeOptions +impl core::fmt::Debug for vortex_row::RowEncodingOptions -pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowEncodingOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::Eq for vortex_row::options::RowEncodeOptions +impl core::fmt::Display for vortex_row::RowEncodingOptions -impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowEncodingOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool +impl core::hash::Hash for vortex_row::RowEncodingOptions -impl core::fmt::Debug for vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowEncodingOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) -pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::iter::traits::collect::FromIterator for vortex_row::RowEncodingOptions -impl core::fmt::Display for vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowEncodingOptions::from_iter>(T) -> Self -pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::marker::StructuralPartialEq for vortex_row::RowEncodingOptions -impl core::hash::Hash for vortex_row::options::RowEncodeOptions - -pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) +pub struct vortex_row::RowSize -impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions +impl core::clone::Clone for vortex_row::RowSize -pub struct vortex_row::RowSize +pub fn vortex_row::RowSize::clone(&self) -> vortex_row::RowSize -impl core::clone::Clone for vortex_row::size::RowSize +impl core::fmt::Debug for vortex_row::RowSize -pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize +pub fn vortex_row::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::fmt::Debug for vortex_row::size::RowSize +impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::RowSize -pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub type vortex_row::RowSize::Options = vortex_row::RowEncodingOptions -impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize +pub fn vortex_row::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity -pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions +pub fn vortex_row::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName -pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity +pub fn vortex_row::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName +pub fn vortex_row::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult +pub fn vortex_row::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId -pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_row::RowSize::is_fallible(&self, &Self::Options) -> bool -pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId +pub fn vortex_row::RowSize::is_null_sensitive(&self, &Self::Options) -> bool -pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool +pub fn vortex_row::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult -pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool +pub fn vortex_row::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> -pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult +pub struct vortex_row::RowSortField -pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult>> +pub vortex_row::RowSortField::descending: bool -pub struct vortex_row::SortField +pub vortex_row::RowSortField::nulls_first: bool -pub vortex_row::SortField::descending: bool +impl vortex_row::RowSortField -pub vortex_row::SortField::nulls_first: bool +pub const fn vortex_row::RowSortField::ascending() -> Self -impl vortex_row::options::SortField +pub const fn vortex_row::RowSortField::descending() -> Self -pub fn vortex_row::options::SortField::new(bool, bool) -> Self +pub const fn vortex_row::RowSortField::new(bool, bool) -> Self -pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8 +pub const fn vortex_row::RowSortField::nulls_first(self) -> Self -pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8 +pub const fn vortex_row::RowSortField::nulls_last(self) -> Self -impl core::clone::Clone for vortex_row::options::SortField +impl core::clone::Clone for vortex_row::RowSortField -pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField +pub fn vortex_row::RowSortField::clone(&self) -> vortex_row::RowSortField -impl core::cmp::Eq for vortex_row::options::SortField +impl core::cmp::Eq for vortex_row::RowSortField -impl core::cmp::PartialEq for vortex_row::options::SortField +impl core::cmp::PartialEq for vortex_row::RowSortField -pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool +pub fn vortex_row::RowSortField::eq(&self, &vortex_row::RowSortField) -> bool -impl core::default::Default for vortex_row::options::SortField +impl core::default::Default for vortex_row::RowSortField -pub fn vortex_row::options::SortField::default() -> Self +pub fn vortex_row::RowSortField::default() -> Self -impl core::fmt::Debug for vortex_row::options::SortField +impl core::fmt::Debug for vortex_row::RowSortField -pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_row::RowSortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::fmt::Display for vortex_row::options::SortField +impl core::fmt::Display for vortex_row::RowSortField -pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_row::RowSortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::hash::Hash for vortex_row::options::SortField +impl core::hash::Hash for vortex_row::RowSortField -pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H) +pub fn vortex_row::RowSortField::hash<__H: core::hash::Hasher>(&self, &mut __H) -impl core::marker::Copy for vortex_row::options::SortField +impl core::iter::traits::collect::FromIterator for vortex_row::RowEncodingOptions -impl core::marker::StructuralPartialEq for vortex_row::options::SortField +pub fn vortex_row::RowEncodingOptions::from_iter>(T) -> Self -pub trait vortex_row::RowEncodeKernel: vortex_array::array::vtable::VTable +impl core::marker::Copy for vortex_row::RowSortField -pub fn vortex_row::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +impl core::marker::StructuralPartialEq for vortex_row::RowSortField -pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable +pub fn vortex_row::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::RowSortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_row::compute_row_sizes_with_options(&[vortex_array::array::erased::ArrayRef], &vortex_row::RowEncodingOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_row::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_row::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::RowSortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_row::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_row::convert_columns_with_options(&[vortex_array::array::erased::ArrayRef], &vortex_row::RowEncodingOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult pub fn vortex_row::initialize(&vortex_session::VortexSession) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index 8468301e5b3..33270b0ad43 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -1,18 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#![allow( - clippy::cast_possible_truncation, - clippy::expect_used, - reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32 elsewhere" -)] - //! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants. //! //! The encoded byte format produces a lexicographically byte-comparable representation: //! comparing the byte slices of two encoded rows yields the same ordering as the //! original logical (tuple) comparison of their values, modulo nulls placement and -//! descending-ness as configured by [`SortField`]. +//! descending-ness as configured by [`RowSortField`]. //! //! Conventions: //! - Every value is preceded by a 1-byte sentinel that orders nulls relative to non-nulls. @@ -21,9 +15,6 @@ //! - Fixed-width integers are big-endian, with the sign bit flipped for signed types. //! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top //! bit; negative flips all bits. -//! -//! This commit covers only the fixed-width canonical variants (Null, Bool, Primitive, -//! Decimal); variable-length and nested canonical variants land in later commits. use vortex_array::Canonical; use vortex_array::ExecutionCtx; @@ -42,22 +33,22 @@ use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; use vortex_array::dtype::DecimalType; use vortex_array::dtype::NativePType; -use vortex_array::dtype::PType; use vortex_array::dtype::half::f16; use vortex_array::match_each_native_ptype; -use vortex_buffer::ByteBufferMut; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use crate::options::SortField; +use crate::options::RowSortField; /// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte). -pub const BOOL_ENCODED_SIZE: u32 = 2; +pub(crate) const BOOL_ENCODED_SIZE: u32 = 2; /// Block size used in the variable-length encoding. -pub const VARLEN_BLOCK_SIZE: usize = 32; +pub(crate) const VARLEN_BLOCK_SIZE: usize = 32; /// Total bytes per varlen block including the trailing continuation marker. -pub const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; +pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; +const VARLEN_BLOCK_TOTAL_U32: u32 = 33; /// Returns the size in bytes of the encoded form of a variable-length value of the given length. #[inline] @@ -66,8 +57,9 @@ fn encoded_size_for_varlen(len: usize) -> u32 { if len == 0 { 1 + 1 } else { - let blocks = len.div_ceil(VARLEN_BLOCK_SIZE); - 1 + (blocks as u32) * (VARLEN_BLOCK_TOTAL as u32) + let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)) + .vortex_expect("varlen block count must fit in u32"); + 1 + blocks * VARLEN_BLOCK_TOTAL_U32 } } @@ -77,13 +69,17 @@ const fn encoded_size_for_fixed(value_bytes: u32) -> u32 { 1 + value_bytes } +fn byte_width_u32(width: usize) -> u32 { + u32::try_from(width).vortex_expect("native byte width must fit in u32") +} + /// Per-row width classification for a column. /// /// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless /// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary, /// List, or any composite that recurses through a variable-width field). #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum RowWidth { +pub(crate) enum RowWidth { /// Per-row width is the same constant for every row in the column. Fixed(u32), /// Per-row width is data-dependent. @@ -96,26 +92,24 @@ pub enum RowWidth { /// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the /// data. /// -/// Classification does not depend on the [`SortField`]: null-vs-non-null encoding width is +/// Classification does not depend on the [`RowSortField`]: null-vs-non-null encoding width is /// the same for fixed-width types (the sentinel byte plus zero-fill for nulls). /// /// # Errors /// -/// Returns an error for dtypes that the row encoder does not yet support. Variable-length -/// dtypes (Utf8/Binary), nested dtypes (Struct/FixedSizeList/Extension), and -/// Variant/Union/List arrive in later commits. -pub fn row_width_for_dtype(dtype: &DType) -> VortexResult { +/// Returns an error for dtypes that the row encoder does not support. +pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { match dtype { DType::Null => Ok(RowWidth::Fixed(1)), DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)), - DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed( - ptype.byte_width() as u32, - ))), + DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( + ptype.byte_width(), + )))), DType::Decimal(dt, _) => { let vt = DecimalType::smallest_decimal_value_type(dt); - Ok(RowWidth::Fixed(encoded_size_for_fixed( - vt.byte_width() as u32 - ))) + Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( + vt.byte_width(), + )))) } DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable), DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? { @@ -154,11 +148,10 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult { /// /// # Errors /// -/// Returns an error for unsupported canonical variants. Variable-length and nested -/// variants land in later commits. -pub fn field_size( +/// Returns an error for unsupported canonical variants. +pub(crate) fn field_size( canonical: &Canonical, - field: SortField, + field: RowSortField, sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { @@ -172,7 +165,7 @@ pub fn field_size( Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?, Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?, Canonical::List(_) => vortex_bail!( - "row encoding does not yet support canonical type {:?}", + "row encoding does not support canonical List arrays: {:?}", canonical.dtype() ), Canonical::Variant(_) => { @@ -188,9 +181,9 @@ pub fn field_size( /// /// After this call returns successfully, `cursors[i]` will have advanced by exactly the /// per-row contribution previously computed by [`field_size`] for the same column. -pub fn field_encode( +pub(crate) fn field_encode( canonical: &Canonical, - field: SortField, + field: RowSortField, offsets: &[u32], cursors: &mut [u32], out: &mut [u8], @@ -206,7 +199,7 @@ pub fn field_encode( Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?, Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?, Canonical::List(_) => vortex_bail!( - "row encoding does not yet support canonical type {:?}", + "row encoding does not support canonical List arrays: {:?}", canonical.dtype() ), Canonical::Variant(_) => { @@ -231,12 +224,12 @@ fn add_size_null(arr: &NullArray, sizes: &mut [u32]) { } fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) { - let width = arr.ptype().byte_width() as u32; + let width = byte_width_u32(arr.ptype().byte_width()); add_size_const(sizes, encoded_size_for_fixed(width)); } fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) { - let width = arr.values_type().byte_width() as u32; + let width = byte_width_u32(arr.values_type().byte_width()); add_size_const(sizes, encoded_size_for_fixed(width)); } @@ -261,7 +254,7 @@ fn add_size_varbinview( fn add_size_struct( arr: &StructArray, - field: SortField, + field: RowSortField, sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { @@ -279,7 +272,7 @@ fn add_size_struct( fn add_size_fsl( arr: &FixedSizeListArray, - field: SortField, + field: RowSortField, sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { @@ -305,7 +298,7 @@ fn add_size_fsl( fn add_size_extension( arr: &ExtensionArray, - field: SortField, + field: RowSortField, sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { @@ -315,7 +308,7 @@ fn add_size_extension( fn encode_null( arr: &NullArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -330,7 +323,7 @@ fn encode_null( fn encode_bool( arr: &BoolArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -359,7 +352,7 @@ fn encode_bool( fn encode_primitive( arr: &PrimitiveArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -373,7 +366,7 @@ fn encode_primitive( fn encode_primitive_typed( arr: &PrimitiveArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -396,14 +389,14 @@ fn encode_primitive_typed( *b = 0; } } - col_offset[i] += encoded_size_for_fixed(value_bytes as u32); + col_offset[i] += encoded_size_for_fixed(byte_width_u32(value_bytes)); } Ok(()) } fn encode_decimal( arr: &DecimalArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -436,7 +429,7 @@ fn encode_decimal( fn encode_decimal_typed( arr: &DecimalArray, mask: &vortex_mask::Mask, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -446,7 +439,7 @@ fn encode_decimal_typed( let non_null = field.non_null_sentinel(); let null = field.null_sentinel(); let value_bytes = size_of::(); - let total = encoded_size_for_fixed(value_bytes as u32); + let total = encoded_size_for_fixed(byte_width_u32(value_bytes)); let slice = arr.buffer::(); for i in 0..slice.len() { let pos = (row_offsets[i] + col_offset[i]) as usize; @@ -465,7 +458,7 @@ fn encode_decimal_typed( fn encode_varbinview( arr: &VarBinViewArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -494,7 +487,7 @@ fn encode_varbinview( fn encode_struct( arr: &StructArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -545,7 +538,7 @@ fn encode_struct( fn encode_fsl( arr: &FixedSizeListArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -608,7 +601,7 @@ fn encode_fsl( fn encode_extension( arr: &ExtensionArray, - field: SortField, + field: RowSortField, row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], @@ -647,16 +640,17 @@ fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { for j in n..VARLEN_BLOCK_SIZE { out[written + j] = xor; } - out[written + VARLEN_BLOCK_SIZE] = (n as u8) ^ xor; + out[written + VARLEN_BLOCK_SIZE] = + u8::try_from(n).vortex_expect("final varlen block length must fit in u8") ^ xor; written += VARLEN_BLOCK_TOTAL; - written as u32 + u32::try_from(written).vortex_expect("encoded varlen byte length must fit in u32") } /// Internal trait for encoding a fixed-width native value into byte slots. /// /// Implementations must produce a sequence of `size_of::()` bytes that is /// lexicographically byte-comparable according to the natural ordering of the type. -pub trait RowEncode: Copy { +pub(crate) trait RowEncode: Copy { /// Encode this value into `out`, inverting the bytes for descending order. fn encode_to(self, out: &mut [u8], descending: bool); } @@ -758,238 +752,3 @@ impl RowEncode for f16 { out.copy_from_slice(&bytes); } } - -/// Encode a single scalar primitive value of a known PType into a buffer slot. -pub fn encode_scalar_primitive( - ptype: PType, - value: vortex_array::scalar::PValue, - field: SortField, - is_null: bool, - out: &mut ByteBufferMut, -) -> VortexResult<()> { - if is_null { - out.push(field.null_sentinel()); - return Ok(()); - } - out.push(field.non_null_sentinel()); - let width = ptype.byte_width(); - let mut tmp = [0u8; 16]; - let buf = &mut tmp[..width]; - match_each_native_ptype!( - ptype, - integral: |T| { - let v: T = T::try_from(value)?; - v.encode_to(buf, field.descending); - }, - floating: |T| { - let v: T = T::try_from(value)?; - v.encode_to(buf, field.descending); - } - ); - out.extend_from_slice(buf); - Ok(()) -} - -/// Encode a single varlen value into a buffer. -pub fn encode_scalar_varlen(value: Option<&[u8]>, field: SortField, out: &mut ByteBufferMut) { - match value { - None => out.push(field.null_sentinel()), - Some(bytes) => { - out.push(field.non_null_sentinel()); - let needed = if bytes.is_empty() { - 1 - } else { - bytes.len().div_ceil(VARLEN_BLOCK_SIZE) * VARLEN_BLOCK_TOTAL - }; - let start = out.len(); - for _ in 0..needed { - out.push(0); - } - let written = encode_varlen_value(bytes, &mut out[start..], field.descending); - debug_assert_eq!(written as usize, needed); - } - } -} - -/// Encode a single boolean value. -pub fn encode_scalar_bool(value: Option, field: SortField, out: &mut ByteBufferMut) { - match value { - None => { - out.push(field.null_sentinel()); - out.push(0); - } - Some(b) => { - out.push(field.non_null_sentinel()); - let raw = if b { 0x02u8 } else { 0x01u8 }; - let xor = if field.descending { 0xFFu8 } else { 0 }; - out.push(raw ^ xor); - } - } -} - -/// Encode a single null-type value (only the sentinel). -pub fn encode_scalar_null(field: SortField, is_null: bool, out: &mut ByteBufferMut) { - if is_null { - out.push(field.null_sentinel()); - } else { - out.push(field.non_null_sentinel()); - } -} - -/// Returns the per-row encoded size for a scalar value (used for the Constant fast path). -pub fn encoded_size_for_scalar( - scalar: &vortex_array::scalar::Scalar, - _field: SortField, -) -> VortexResult { - if scalar.is_null() { - match scalar.dtype() { - DType::Null => Ok(1), - DType::Bool(_) => Ok(BOOL_ENCODED_SIZE), - DType::Primitive(ptype, _) => Ok(encoded_size_for_fixed(ptype.byte_width() as u32)), - DType::Decimal(dt, _) => { - let vt = DecimalType::smallest_decimal_value_type(dt); - Ok(encoded_size_for_fixed(vt.byte_width() as u32)) - } - DType::Utf8(_) | DType::Binary(_) => Ok(1), - _ => vortex_bail!( - "unsupported scalar dtype for row encoding: {}", - scalar.dtype() - ), - } - } else { - match scalar.dtype() { - DType::Null => Ok(1), - DType::Bool(_) => Ok(BOOL_ENCODED_SIZE), - DType::Primitive(ptype, _) => Ok(encoded_size_for_fixed(ptype.byte_width() as u32)), - DType::Decimal(..) => { - let dec = scalar.as_decimal(); - let vt = dec - .decimal_value() - .map(|v| v.decimal_type()) - .unwrap_or(DecimalType::I128); - Ok(encoded_size_for_fixed(vt.byte_width() as u32)) - } - DType::Utf8(_) => { - let bs = scalar - .as_utf8() - .value() - .map(|s| s.as_str().len()) - .unwrap_or(0); - Ok(encoded_size_for_varlen(bs)) - } - DType::Binary(_) => { - let bs = scalar.as_binary().value().map(|b| b.len()).unwrap_or(0); - Ok(encoded_size_for_varlen(bs)) - } - _ => vortex_bail!( - "unsupported scalar dtype for row encoding: {}", - scalar.dtype() - ), - } - } -} - -/// Encode a single scalar value into a fresh `Bytes` buffer. -pub fn encode_scalar( - scalar: &vortex_array::scalar::Scalar, - field: SortField, -) -> VortexResult { - use vortex_array::scalar::PValue; - let size = encoded_size_for_scalar(scalar, field)? as usize; - let mut out = ByteBufferMut::with_capacity(size); - if scalar.is_null() { - match scalar.dtype() { - DType::Null => out.push(field.null_sentinel()), - DType::Bool(_) => { - out.push(field.null_sentinel()); - out.push(0); - } - DType::Primitive(ptype, _) => { - out.push(field.null_sentinel()); - let width = ptype.byte_width(); - for _ in 0..width { - out.push(0); - } - } - DType::Decimal(dt, _) => { - out.push(field.null_sentinel()); - let vt = DecimalType::smallest_decimal_value_type(dt); - for _ in 0..vt.byte_width() { - out.push(0); - } - } - DType::Utf8(_) | DType::Binary(_) => out.push(field.null_sentinel()), - _ => vortex_bail!( - "unsupported scalar dtype for row encoding: {}", - scalar.dtype() - ), - } - } else { - match scalar.dtype() { - DType::Null => out.push(field.non_null_sentinel()), - DType::Bool(_) => { - let v = scalar.as_bool().value().unwrap_or(false); - encode_scalar_bool(Some(v), field, &mut out); - } - DType::Primitive(ptype, _) => { - let v: PValue = scalar - .as_primitive() - .pvalue() - .ok_or_else(|| vortex_error::vortex_err!("missing primitive value"))?; - encode_scalar_primitive(*ptype, v, field, false, &mut out)?; - } - DType::Decimal(..) => { - let dec = scalar.as_decimal(); - out.push(field.non_null_sentinel()); - let value = dec - .decimal_value() - .ok_or_else(|| vortex_error::vortex_err!("missing decimal value"))?; - match value { - vortex_array::scalar::DecimalValue::I8(v) => { - let mut tmp = [0u8; 1]; - v.encode_to(&mut tmp, field.descending); - out.extend_from_slice(&tmp); - } - vortex_array::scalar::DecimalValue::I16(v) => { - let mut tmp = [0u8; 2]; - v.encode_to(&mut tmp, field.descending); - out.extend_from_slice(&tmp); - } - vortex_array::scalar::DecimalValue::I32(v) => { - let mut tmp = [0u8; 4]; - v.encode_to(&mut tmp, field.descending); - out.extend_from_slice(&tmp); - } - vortex_array::scalar::DecimalValue::I64(v) => { - let mut tmp = [0u8; 8]; - v.encode_to(&mut tmp, field.descending); - out.extend_from_slice(&tmp); - } - vortex_array::scalar::DecimalValue::I128(v) => { - let mut tmp = [0u8; 16]; - v.encode_to(&mut tmp, field.descending); - out.extend_from_slice(&tmp); - } - vortex_array::scalar::DecimalValue::I256(_) => { - vortex_bail!("row encoding for Decimal256 is not yet implemented") - } - } - } - DType::Utf8(_) => { - let v = scalar.as_utf8(); - let bytes = v.value().map(|s| s.as_str().as_bytes()).unwrap_or(&[]); - encode_scalar_varlen(Some(bytes), field, &mut out); - } - DType::Binary(_) => { - let v = scalar.as_binary(); - let bytes = v.value().map(|b| b.as_slice()).unwrap_or(&[]); - encode_scalar_varlen(Some(bytes), field, &mut out); - } - _ => vortex_bail!( - "unsupported scalar dtype for row encoding: {}", - scalar.dtype() - ), - } - } - Ok(out.freeze().into_inner()) -} diff --git a/vortex-row/src/convert.rs b/vortex-row/src/convert.rs deleted file mode 100644 index c3b06d92748..00000000000 --- a/vortex-row/src/convert.rs +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! User-facing entry point: turn N columnar arrays into one row-encoded `ListView`. - -use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; -use vortex_array::arrays::ListViewArray; -use vortex_array::scalar_fn::ScalarFnVTable; -use vortex_array::scalar_fn::VecExecutionArgs; -use vortex_error::VortexResult; -use vortex_error::vortex_bail; - -use crate::encode::RowEncode; -use crate::options::RowEncodeOptions; -use crate::options::SortField; -use crate::size::RowSize; - -/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose -/// bytes are lexicographically comparable in the same order as a tuple comparison of the -/// input values according to `fields`. -pub fn convert_columns( - cols: &[ArrayRef], - fields: &[SortField], - ctx: &mut ExecutionCtx, -) -> VortexResult { - if cols.len() != fields.len() { - vortex_bail!( - "convert_columns: cols.len() ({}) does not match fields.len() ({})", - cols.len(), - fields.len() - ); - } - if cols.is_empty() { - vortex_bail!("convert_columns: at least one column is required"); - } - let nrows = cols[0].len(); - for (i, col) in cols.iter().enumerate() { - if col.len() != nrows { - vortex_bail!( - "convert_columns: column {} has length {} but expected {}", - i, - col.len(), - nrows - ); - } - } - - let options = RowEncodeOptions::new(fields.iter().copied()); - let args = VecExecutionArgs::new(cols.to_vec(), nrows); - let result = RowEncode.execute(&options, &args, ctx)?; - result.execute::(ctx) -} - -/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns. -pub fn compute_row_sizes( - cols: &[ArrayRef], - fields: &[SortField], - ctx: &mut ExecutionCtx, -) -> VortexResult { - if cols.len() != fields.len() { - vortex_bail!( - "compute_row_sizes: cols.len() ({}) does not match fields.len() ({})", - cols.len(), - fields.len() - ); - } - if cols.is_empty() { - vortex_bail!("compute_row_sizes: at least one column is required"); - } - let nrows = cols[0].len(); - let options = RowEncodeOptions::new(fields.iter().copied()); - let args = VecExecutionArgs::new(cols.to_vec(), nrows); - RowSize.execute(&options, &args, ctx) -} diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index 1b77d955964..4bc4962503e 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -1,11 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#![allow( - clippy::cast_possible_truncation, - reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32" -)] - //! `RowEncode` variadic scalar function: encode N input columns into a single `ListView`. //! //! The output's `(elements, offsets, sizes)` triple is built up in a single left-to-right @@ -16,11 +11,9 @@ use std::sync::Arc; use vortex_array::ArrayRef; -use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::VTable; use vortex_array::arrays::ListViewArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::DType; @@ -40,27 +33,31 @@ use vortex_error::vortex_bail; use vortex_session::VortexSession; use crate::codec; -use crate::options::RowEncodeOptions; -use crate::options::SortField; -use crate::options::deserialize_row_encode_options; -use crate::options::serialize_row_encode_options; +use crate::options::RowEncodingOptions; +use crate::options::RowSortField; +use crate::options::deserialize_row_encoding_options; +use crate::options::serialize_row_encoding_options; use crate::size::compute_sizes; /// Variadic scalar function that encodes N input columns into a single `List` /// [`ListViewArray`] where row `i` contains the row-encoded bytes for column values /// `cols[0][i], cols[1][i], ...` concatenated left-to-right. +/// +/// This scalar function is public for session registration and encoding extension work. +/// Most callers should use [`RowEncoder`](crate::RowEncoder) rather than invoking the scalar +/// function directly. #[derive(Clone, Debug)] pub struct RowEncode; impl ScalarFnVTable for RowEncode { - type Options = RowEncodeOptions; + type Options = RowEncodingOptions; fn id(&self) -> ScalarFnId { ScalarFnId::from("vortex.row_encode") } fn serialize(&self, options: &Self::Options) -> VortexResult>> { - Ok(Some(serialize_row_encode_options(options))) + Ok(Some(serialize_row_encoding_options(options))) } fn deserialize( @@ -68,7 +65,7 @@ impl ScalarFnVTable for RowEncode { metadata: &[u8], _session: &VortexSession, ) -> VortexResult { - deserialize_row_encode_options(metadata) + deserialize_row_encoding_options(metadata) } fn arity(&self, _options: &Self::Options) -> Arity { @@ -105,7 +102,7 @@ impl ScalarFnVTable for RowEncode { } fn execute_row_encode( - options: &RowEncodeOptions, + options: &RowEncodingOptions, args: &dyn ExecutionArgs, ctx: &mut ExecutionCtx, ) -> VortexResult { @@ -115,10 +112,8 @@ fn execute_row_encode( let crate::size::SizePassResult { fixed_per_row, var_lengths, - col_kinds: _, - first_varlen_idx: _, columns, - } = compute_sizes(options, args, ctx, "RowEncode")?; + } = compute_sizes(options, args, ctx)?; // ===== Phase 2: totals + buffer ===== let var_total: u64 = var_lengths @@ -131,12 +126,11 @@ fn execute_row_encode( if total > u32::MAX as u64 { vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total); } - let total_len = total as usize; + let total_len = + usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize"); // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder - // assume previously-untouched bytes are zero, simplifying the null-row fill paths. - // PR 2 skips this memset because every byte in the output range is written by some - // encoder. + // assume previously untouched bytes are zero, simplifying the null-row fill paths. let mut out_buf: BufferMut = BufferMut::with_capacity(total_len); out_buf.push_n(0u8, total_len); @@ -148,8 +142,10 @@ fn execute_row_encode( match var_lengths.as_ref() { None => { for i in 0..nrows { + let row_idx = + u32::try_from(i).vortex_expect("row index must fit in u32 after validation"); listview_offsets.push( - (i as u32) + row_idx .checked_mul(fixed_per_row) .vortex_expect("row offset overflow (already validated total fits in u32)"), ); @@ -158,7 +154,9 @@ fn execute_row_encode( Some(v) => { let mut acc: u32 = 0; for (i, &l) in v.iter().enumerate() { - let off = (i as u32) + let row_idx = + u32::try_from(i).vortex_expect("row index must fit in u32 after validation"); + let off = row_idx .checked_mul(fixed_per_row) .and_then(|t| t.checked_add(acc)) .vortex_expect("row offset overflow"); @@ -201,14 +199,13 @@ fn execute_row_encode( ) } -/// Dispatch a single column's encoding into the shared `out` buffer. +/// Dispatch a single column's encoding into the shared `out` buffer through the canonical path. /// -/// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path. -/// In-crate fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry -/// for downstream encodings are added in PR 3. -pub fn dispatch_encode( +/// TODO(row): add per-encoding fast paths here so Constant, Dictionary, and compressed arrays +/// can write row bytes without canonicalizing. +pub(crate) fn dispatch_encode( col: &ArrayRef, - field: SortField, + field: RowSortField, offsets: &[u32], cursors: &mut [u32], out: &mut [u8], @@ -217,22 +214,3 @@ pub fn dispatch_encode( let canonical = col.clone().execute::(ctx)?; codec::field_encode(&canonical, field, offsets, cursors, out, ctx) } - -/// Mutate-buffer kernel: write this column's per-row bytes into `out` at -/// `offsets[i] + cursors[i]`, advancing `cursors[i]` by the bytes written. -/// -/// Return `Ok(None)` to decline and fall back to the canonical path. -/// -/// Trait is defined now; per-encoding impls and dispatch wiring land in PR 3. -pub trait RowEncodeKernel: VTable { - /// Write this column's per-row bytes into `out` at `offsets[i] + cursors[i]`, advancing - /// `cursors[i]` by the bytes written. - fn row_encode_into( - column: ArrayView<'_, Self>, - field: SortField, - offsets: &[u32], - cursors: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, - ) -> VortexResult>; -} diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index fddcca665c1..d921e2998e3 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -1,55 +1,59 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Row-oriented byte encoder, analogous to Apache Arrow's `arrow-row` crate. +//! Row-oriented byte encoding for Vortex arrays. //! -//! The encoder converts N columnar arrays into a single `List` array where each row's -//! bytes are lexicographically comparable in the same order as a tuple comparison of the -//! original values. This is useful for sorting, hashing into row containers, and other -//! operations that benefit from a sort-friendly opaque byte representation of a multi-column -//! key. +//! This crate converts one or more columnar arrays into a single `ListView` array whose +//! row byte slices can be compared lexicographically. The byte ordering matches tuple +//! ordering of the input values under the requested [`RowSortField`] settings, making the +//! representation useful for sort keys and other row-key operations. //! -//! Two variadic scalar functions drive the implementation: -//! - [`RowSize`] computes per-row byte sizes across all N input columns. -//! - [`RowEncode`] writes the row-encoded bytes into a single `ListView` accumulator -//! in one left-to-right pass. +//! The public entry points are: +//! - [`RowEncoder`], the primary API for encoding columns into row bytes. +//! - [`RowEncoder::row_sizes`], which computes the fixed and variable byte contributions +//! without materializing the encoded rows. +//! - [`convert_columns`] and [`compute_row_sizes`], compatibility helpers around +//! [`RowEncoder`]. +//! - [`initialize`], which registers the [`RowSize`] and [`RowEncode`] scalar functions on a +//! [`VortexSession`]. //! -//! Each scalar function exposes a per-encoding fast-path trait -//! ([`RowSizeKernel`] / [`RowEncodeKernel`]) for downstream encodings to plug into; PR 3 -//! adds in-crate impls for `Constant`, `Dict`, and `Patched` and an inventory-based -//! registry for external encodings. +//! Internally, encoding is split into two scalar functions. [`RowSize`] performs the sizing +//! pass and classifies fixed-width versus variable-width input columns. [`RowEncode`] uses +//! those sizes to allocate one contiguous elements buffer, then writes each column's bytes +//! into the per-row slots from left to right. //! -//! The user-facing entry point is [`convert_columns`]. -//! -//! Row-encoding scalar functions are not registered in the default -//! [`VortexSession`]. Call [`initialize`] on a session to make `RowSize` and `RowEncode` -//! available via the expression layer. +//! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to +//! 128 bits, UTF-8 and binary values, structs, fixed-size lists, and extensions whose storage +//! type is supported. Variant, union, and variable-size list arrays are rejected because this +//! crate does not define an ordering for them. -pub mod codec; -pub mod convert; -pub mod encode; -pub mod options; -pub mod size; +mod codec; +mod encode; +mod encoder; +mod options; +mod size; #[cfg(test)] mod tests; -pub use convert::compute_row_sizes; -pub use convert::convert_columns; pub use encode::RowEncode; -pub use encode::RowEncodeKernel; -pub use options::RowEncodeOptions; -pub use options::SortField; +pub use encoder::RowEncoder; +pub use encoder::compute_row_sizes; +pub use encoder::compute_row_sizes_with_options; +pub use encoder::convert_columns; +pub use encoder::convert_columns_with_options; +pub use options::RowEncodingOptions; +pub use options::RowSortField; pub use size::RowSize; -pub use size::RowSizeKernel; use vortex_array::scalar_fn::session::ScalarFnSessionExt; use vortex_session::VortexSession; /// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given /// session. /// -/// Call once on session construction if you want row encoding available via the expression -/// layer or via [`convert_columns`]. +/// Call this during session construction when row encoding must be available through the +/// expression layer. The direct [`RowEncoder`] API constructs the scalar-function calls +/// itself and does not require global registration. pub fn initialize(session: &VortexSession) { session.scalar_fns().register(RowSize); session.scalar_fns().register(RowEncode); diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs index a9e5e2b18ab..380c9a3827f 100644 --- a/vortex-row/src/options.rs +++ b/vortex-row/src/options.rs @@ -6,35 +6,26 @@ use std::fmt::Formatter; use smallvec::SmallVec; -/// Per-column options for the row-oriented byte encoder. +/// Per-column ordering options for row-oriented encoding. /// -/// These options control how a single column is encoded into row bytes: -/// - `descending`: if true, the encoded value bytes are bit-inverted so that -/// lexicographic byte comparison reflects the reverse of the natural ordering. -/// The null sentinel byte is NOT inverted, so nulls keep their requested -/// position relative to non-nulls. -/// - `nulls_first`: if true, nulls sort before non-nulls. If false, nulls sort -/// after non-nulls. Implemented via the sentinel byte that precedes every -/// value's encoded bytes. +/// A `RowSortField` describes how one input column contributes to a row key. Descending order +/// reverses the encoded value bytes for that column. Null placement is controlled separately, +/// so nulls keep the requested position relative to non-null values in either direction. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct SortField { - /// If true, encoded value bytes are bit-inverted so lexicographic byte - /// comparison reflects the reverse of the natural ordering. +pub struct RowSortField { + /// If true, this column sorts in descending order. pub descending: bool, - /// If true, nulls sort before non-null values; otherwise nulls sort after. + /// If true, nulls sort before non-null values. pub nulls_first: bool, } -impl Default for SortField { +impl Default for RowSortField { fn default() -> Self { - Self { - descending: false, - nulls_first: true, - } + Self::ascending() } } -impl Display for SortField { +impl Display for RowSortField { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -44,25 +35,47 @@ impl Display for SortField { } } -impl SortField { - /// Construct a new `SortField` with explicit options. - pub fn new(descending: bool, nulls_first: bool) -> Self { +impl RowSortField { + /// Construct a new `RowSortField` with explicit options. + pub const fn new(descending: bool, nulls_first: bool) -> Self { Self { descending, nulls_first, } } + /// Construct an ascending field with nulls first. + pub const fn ascending() -> Self { + Self::new(false, true) + } + + /// Construct a descending field with nulls first. + pub const fn descending() -> Self { + Self::new(true, true) + } + + /// Return this field with nulls ordered before non-null values. + pub const fn nulls_first(mut self) -> Self { + self.nulls_first = true; + self + } + + /// Return this field with nulls ordered after non-null values. + pub const fn nulls_last(mut self) -> Self { + self.nulls_first = false; + self + } + /// Returns the sentinel byte to write for a non-null value. #[inline] - pub fn non_null_sentinel(&self) -> u8 { + pub(crate) fn non_null_sentinel(&self) -> u8 { // Non-null is always 0x01. Null choices are < or > 0x01. 0x01 } /// Returns the sentinel byte to write for a null value. #[inline] - pub fn null_sentinel(&self) -> u8 { + pub(crate) fn null_sentinel(&self) -> u8 { if self.nulls_first { // Nulls before non-nulls (smaller byte sorts first). 0x00 @@ -73,34 +86,55 @@ impl SortField { } } -/// Inline capacity for [`RowEncodeOptions::fields`]. Up to this many [`SortField`]s -/// are held inline without a heap allocation; beyond, the storage spills. -pub const FIELDS_INLINE: usize = 4; +const FIELDS_INLINE: usize = 4; -/// Options for the variadic [`RowSize`] and [`RowEncode`] scalar functions: -/// one [`SortField`] per input column. -/// -/// Stored in a [`SmallVec`] so that typical 1–4 column keys avoid a heap -/// allocation; longer field lists spill to the heap transparently. +/// Ordering options for row-oriented encoding. /// -/// [`RowSize`]: super::size::RowSize -/// [`RowEncode`]: super::encode::RowEncode +/// The options contain one [`RowSortField`] per input column, in the same order as the columns +/// passed to [`convert_columns`](crate::convert_columns), +/// [`compute_row_sizes`](crate::compute_row_sizes), [`RowSize`](crate::RowSize), or +/// [`RowEncode`](crate::RowEncode). #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct RowEncodeOptions { - /// Per-column sort fields, in left-to-right column order. - pub fields: SmallVec<[SortField; FIELDS_INLINE]>, +pub struct RowEncodingOptions { + pub(crate) fields: SmallVec<[RowSortField; FIELDS_INLINE]>, } -impl RowEncodeOptions { - /// Construct a new `RowEncodeOptions` from any iterator of [`SortField`]s. - pub fn new(fields: impl IntoIterator) -> Self { +impl RowEncodingOptions { + /// Construct a new `RowEncodingOptions` from any iterator of [`RowSortField`]s. + pub fn new(fields: impl IntoIterator) -> Self { Self { fields: fields.into_iter().collect(), } } + + /// Construct default ascending, nulls-first options for `column_count` input columns. + pub fn default_for_columns(column_count: usize) -> Self { + Self::new(std::iter::repeat_n(RowSortField::default(), column_count)) + } + + /// Borrow the per-column sort fields. + pub fn fields(&self) -> &[RowSortField] { + &self.fields + } + + /// Return the number of input columns described by these options. + pub fn len(&self) -> usize { + self.fields.len() + } + + /// Return true when the options do not describe any input columns. + pub fn is_empty(&self) -> bool { + self.fields.is_empty() + } +} + +impl FromIterator for RowEncodingOptions { + fn from_iter>(iter: T) -> Self { + Self::new(iter) + } } -impl Display for RowEncodeOptions { +impl Display for RowEncodingOptions { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; for (i, field) in self.fields.iter().enumerate() { @@ -113,12 +147,12 @@ impl Display for RowEncodeOptions { } } -/// Serialize a [`RowEncodeOptions`] to a compact byte vector: 4-byte LE length followed by +/// Serialize a [`RowEncodingOptions`] to a compact byte vector: 4-byte LE length followed by /// `2 * len` bytes (descending + nulls_first booleans for each field). -pub(crate) fn serialize_row_encode_options(opts: &RowEncodeOptions) -> Vec { +pub(crate) fn serialize_row_encoding_options(opts: &RowEncodingOptions) -> Vec { use vortex_error::VortexExpect; let n = - u32::try_from(opts.fields.len()).vortex_expect("RowEncodeOptions length must fit in u32"); + u32::try_from(opts.fields.len()).vortex_expect("RowEncodingOptions length must fit in u32"); let mut out = Vec::with_capacity(4 + 2 * opts.fields.len()); out.extend_from_slice(&n.to_le_bytes()); for f in &opts.fields { @@ -128,30 +162,32 @@ pub(crate) fn serialize_row_encode_options(opts: &RowEncodeOptions) -> Vec { out } -/// Deserialize a [`RowEncodeOptions`] produced by [`serialize_row_encode_options`]. -pub(crate) fn deserialize_row_encode_options( +/// Deserialize a [`RowEncodingOptions`] produced by [`serialize_row_encoding_options`]. +pub(crate) fn deserialize_row_encoding_options( bytes: &[u8], -) -> vortex_error::VortexResult { +) -> vortex_error::VortexResult { if bytes.len() < 4 { - vortex_error::vortex_bail!("RowEncodeOptions metadata must contain a 4-byte length prefix"); + vortex_error::vortex_bail!( + "RowEncodingOptions metadata must contain a 4-byte length prefix" + ); } let n = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize; let expected = 4 + 2 * n; if bytes.len() != expected { vortex_error::vortex_bail!( - "RowEncodeOptions metadata wrong size: got {}, expected {}", + "RowEncodingOptions metadata wrong size: got {}, expected {}", bytes.len(), expected ); } - let mut fields: SmallVec<[SortField; FIELDS_INLINE]> = SmallVec::with_capacity(n); + let mut fields: SmallVec<[RowSortField; FIELDS_INLINE]> = SmallVec::with_capacity(n); let mut i = 4; for _ in 0..n { - fields.push(SortField { + fields.push(RowSortField { descending: bytes[i] != 0, nulls_first: bytes[i + 1] != 0, }); i += 2; } - Ok(RowEncodeOptions { fields }) + Ok(RowEncodingOptions { fields }) } diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index 7148a2a21d8..48d4f8e4dbc 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -6,11 +6,9 @@ use std::sync::Arc; use vortex_array::ArrayRef; -use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::VTable; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; @@ -35,53 +33,16 @@ use vortex_session::VortexSession; use crate::codec; use crate::codec::RowWidth; -use crate::options::RowEncodeOptions; -use crate::options::SortField; -use crate::options::deserialize_row_encode_options; -use crate::options::serialize_row_encode_options; - -/// Classification of a single input column for the size pass. -/// -/// Tracks each column's within-row byte offset (the constant prefix from all preceding -/// fixed-width columns) and, for fixed columns, whether any variable-length column has -/// appeared yet — the encode pass uses this to choose between the arithmetic-write fast -/// path (no varlen before this column, so the within-row position is constant) and the -/// cursor-write path. -#[derive(Clone, Copy, Debug)] -#[allow( - dead_code, - reason = "fields read by the RowEncode pipeline in a later commit" -)] -pub(crate) enum ColKind { - /// Column has fixed width `width`. `prefix` is the within-row byte offset of this - /// column's first byte. If `before_varlen` is true, no variable-length column precedes - /// this one, so the within-row offset is constant for every row. - Fixed { - width: u32, - prefix: u32, - before_varlen: bool, - }, - /// Column has variable per-row width. `fixed_prefix` is the sum of widths of all - /// preceding fixed columns; the varlen contribution from earlier varlen columns is - /// added per row. - Variable { fixed_prefix: u32 }, -} +use crate::options::RowEncodingOptions; +use crate::options::RowSortField; +use crate::options::deserialize_row_encoding_options; +use crate::options::serialize_row_encoding_options; /// Result of the size pass: enough information for both [`RowSize::execute`] and the /// downstream [`RowEncode`](super::encode::RowEncode) pipeline. pub(crate) struct SizePassResult { pub fixed_per_row: u32, pub var_lengths: Option>, - #[allow( - dead_code, - reason = "consumed by the arithmetic-write fast path added in PR 2" - )] - pub col_kinds: Vec, - #[allow( - dead_code, - reason = "consumed by the arithmetic-write fast path added in PR 2" - )] - pub first_varlen_idx: Option, pub columns: Vec, } @@ -94,40 +55,34 @@ pub(crate) struct SizePassResult { /// /// This is shared by [`RowSize::execute`] (which wraps the result into a /// `Struct { fixed, var }`) and the [`RowEncode`](super::encode::RowEncode) pipeline -/// (which uses the full result, including `col_kinds`, to drive the encode pass). +/// (which reuses the canonicalized columns for the encode pass). pub(crate) fn compute_sizes( - options: &RowEncodeOptions, + options: &RowEncodingOptions, args: &dyn ExecutionArgs, ctx: &mut ExecutionCtx, - op_name: &'static str, ) -> VortexResult { let n_inputs = args.num_inputs(); if n_inputs == 0 { - vortex_bail!("{} requires at least one input column", op_name); + vortex_bail!("at least one input column is required"); } - if options.fields.len() != n_inputs { + if options.len() != n_inputs { vortex_bail!( - "{} options.fields.len()={} does not match num_inputs={}", - op_name, - options.fields.len(), + "options len ({}) does not match num_inputs ({})", + options.len(), n_inputs ); } let nrows = args.row_count(); let mut columns: Vec = Vec::with_capacity(n_inputs); - let mut col_kinds: Vec = Vec::with_capacity(n_inputs); let mut fixed_per_row: u32 = 0; let mut var_lengths: Option> = None; - let mut first_varlen_idx: Option = None; - let mut running_fixed_prefix: u32 = 0; for i in 0..n_inputs { let col = args.get(i)?; if col.len() != nrows { vortex_bail!( - "{}: column {} has length {} but expected {}", - op_name, + "column {} has length {} but expected {}", i, col.len(), nrows @@ -135,27 +90,13 @@ pub(crate) fn compute_sizes( } match codec::row_width_for_dtype(col.dtype())? { RowWidth::Fixed(w) => { - col_kinds.push(ColKind::Fixed { - width: w, - prefix: running_fixed_prefix, - before_varlen: first_varlen_idx.is_none(), - }); fixed_per_row = fixed_per_row .checked_add(w) .vortex_expect("row width overflow"); - running_fixed_prefix = running_fixed_prefix - .checked_add(w) - .vortex_expect("row width overflow"); } RowWidth::Variable => { - if first_varlen_idx.is_none() { - first_varlen_idx = Some(i); - } let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]); dispatch_size(&col, options.fields[i], v, ctx)?; - col_kinds.push(ColKind::Variable { - fixed_prefix: running_fixed_prefix, - }); } } columns.push(col); @@ -164,13 +105,11 @@ pub(crate) fn compute_sizes( Ok(SizePassResult { fixed_per_row, var_lengths, - col_kinds, - first_varlen_idx, columns, }) } -/// Variadic scalar function that, given N input columns and per-column [`SortField`]s, +/// Variadic scalar function that, given N input columns and per-column [`RowSortField`]s, /// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the /// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode). /// @@ -180,6 +119,10 @@ pub(crate) fn compute_sizes( /// [`PrimitiveArray`] of per-row varlen-byte sums otherwise. /// /// The total per-row byte size is `fixed + var`. +/// +/// This scalar function is public for session registration and encoding extension work. +/// Most callers should use [`RowEncoder::row_sizes`](crate::RowEncoder::row_sizes) rather +/// than invoking the scalar function directly. #[derive(Clone, Debug)] pub struct RowSize; @@ -203,14 +146,14 @@ pub(crate) fn row_size_struct_dtype() -> DType { } impl ScalarFnVTable for RowSize { - type Options = RowEncodeOptions; + type Options = RowEncodingOptions; fn id(&self) -> ScalarFnId { ScalarFnId::from("vortex.row_size") } fn serialize(&self, options: &Self::Options) -> VortexResult>> { - Ok(Some(serialize_row_encode_options(options))) + Ok(Some(serialize_row_encoding_options(options))) } fn deserialize( @@ -218,7 +161,7 @@ impl ScalarFnVTable for RowSize { metadata: &[u8], _session: &VortexSession, ) -> VortexResult { - deserialize_row_encode_options(metadata) + deserialize_row_encoding_options(metadata) } fn arity(&self, _options: &Self::Options) -> Arity { @@ -240,7 +183,7 @@ impl ScalarFnVTable for RowSize { ctx: &mut ExecutionCtx, ) -> VortexResult { let nrows = args.row_count(); - let result = compute_sizes(options, args, ctx, "RowSize")?; + let result = compute_sizes(options, args, ctx)?; let fixed_array = ConstantArray::new(Scalar::from(result.fixed_per_row), nrows).into_array(); let var_array = match result.var_lengths { @@ -266,31 +209,16 @@ impl ScalarFnVTable for RowSize { } } -/// Dispatch a single column's per-row size contribution. +/// Dispatch a single column's per-row size contribution through the canonical path. /// -/// For PR 1 this is just the canonicalize-then-`codec::field_size` fallback path. In-crate -/// fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry for -/// downstream encodings are added in PR 3. -pub fn dispatch_size( +/// TODO(row): add per-encoding fast paths here so Constant, Dictionary, and compressed arrays +/// can contribute row sizes without canonicalizing. +pub(crate) fn dispatch_size( col: &ArrayRef, - field: SortField, + field: RowSortField, sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { let canonical = col.clone().execute::(ctx)?; codec::field_size(&canonical, field, sizes, ctx) } - -/// Mutate-buffer kernel: add this column's per-row byte contribution into the shared -/// `sizes` slice. Return `Ok(None)` to decline and fall back to the canonical path. -/// -/// Trait is defined now; per-encoding impls and dispatch wiring land in PR 3. -pub trait RowSizeKernel: VTable { - /// Add this column's per-row byte contribution into `sizes`. - fn row_size_contribution( - column: ArrayView<'_, Self>, - field: SortField, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, - ) -> VortexResult>; -} diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs index ff7d8fb274a..a1eaadf3803 100644 --- a/vortex-row/src/tests.rs +++ b/vortex-row/src/tests.rs @@ -1,15 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#![allow( - clippy::approx_constant, - clippy::cloned_ref_to_slice_refs, - clippy::redundant_clone, - reason = "tests value clarity over micro-optimization" -)] - //! Tests for the row encoder. +use std::f64::consts::PI; + use rstest::rstest; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; @@ -21,8 +16,12 @@ use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::listview::ListViewArrayExt; use vortex_error::VortexResult; -use crate::SortField; +use crate::RowEncoder; +use crate::RowEncodingOptions; +use crate::RowSortField; +use crate::compute_row_sizes_with_options; use crate::convert_columns; +use crate::convert_columns_with_options; fn collect_row_bytes(array: &ListViewArray) -> Vec> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); @@ -41,10 +40,7 @@ fn collect_row_bytes(array: &ListViewArray) -> Vec> { fn assert_sort_order_i64(values: Vec, descending: bool) -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let field = SortField { - descending, - nulls_first: true, - }; + let field = RowSortField::new(descending, true); let encoded = convert_columns(&[col], &[field], &mut ctx)?; let rows = collect_row_bytes(&encoded); @@ -57,7 +53,7 @@ fn assert_sort_order_i64(values: Vec, descending: bool) -> VortexResult<()> } let expected_order: Vec> = idx.iter().map(|&i| rows[i].clone()).collect(); - let mut sorted = rows.clone(); + let mut sorted = rows; sorted.sort(); assert_eq!( sorted, expected_order, @@ -79,7 +75,7 @@ fn primitive_u32_sort_order() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let values: Vec = vec![0, 1, 100, u32::MAX, 42, 17]; let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted_rows = rows.clone(); @@ -98,9 +94,9 @@ fn primitive_f64_sort_order() -> VortexResult<()> { // We use IEEE total-ordering semantics: -0.0 < +0.0 in the byte encoding (matches // `arrow-row`). Avoid -0.0 in the natural-order baseline since partial_cmp says // -0.0 == 0.0. - let values: Vec = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, 3.14]; + let values: Vec = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, PI]; let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted_rows = rows.clone(); @@ -117,7 +113,7 @@ fn primitive_f64_sort_order() -> VortexResult<()> { fn bool_sort_order() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let col = BoolArray::from_iter([true, false, true, false]).into_array(); - let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted = rows.clone(); @@ -142,7 +138,7 @@ fn utf8_sort_order() -> VortexResult<()> { "banana_loaf_for_test", ]; let col = VarBinViewArray::from_iter_str(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted = rows.clone(); @@ -164,7 +160,7 @@ fn multi_column_sort() -> VortexResult<()> { let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array(); let encoded = convert_columns( &[col0, col1], - &[SortField::default(), SortField::default()], + &[RowSortField::default(), RowSortField::default()], &mut ctx, )?; let rows = collect_row_bytes(&encoded); @@ -186,15 +182,12 @@ fn nulls_first_and_last() -> VortexResult<()> { // nulls_first=true let encoded = convert_columns( - &[col.clone()], - &[SortField { - descending: false, - nulls_first: true, - }], + std::slice::from_ref(&col), + &[RowSortField::ascending()], &mut ctx, )?; let rows = collect_row_bytes(&encoded); - let mut sorted = rows.clone(); + let mut sorted = rows; sorted.sort(); // The first two sorted entries should be nulls let null_count = values.iter().filter(|v| v.is_none()).count(); @@ -203,16 +196,9 @@ fn nulls_first_and_last() -> VortexResult<()> { assert_eq!(sorted[i][0], 0x00); } // nulls_first=false - let encoded = convert_columns( - &[col], - &[SortField { - descending: false, - nulls_first: false, - }], - &mut ctx, - )?; + let encoded = convert_columns(&[col], &[RowSortField::ascending().nulls_last()], &mut ctx)?; let rows = collect_row_bytes(&encoded); - let mut sorted = rows.clone(); + let mut sorted = rows; sorted.sort(); // The last two sorted entries should be nulls for i in 0..null_count { @@ -222,6 +208,60 @@ fn nulls_first_and_last() -> VortexResult<()> { Ok(()) } +#[test] +fn reusable_options_helpers() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let options = RowEncodingOptions::new([RowSortField::descending().nulls_last()]); + assert_eq!(options.len(), 1); + assert!(!options.is_empty()); + assert_eq!( + options.fields(), + &[RowSortField { + descending: true, + nulls_first: false + }] + ); + + let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); + let encoder = RowEncoder::with_options(options.clone()); + assert_eq!(encoder.options(), Some(&options)); + + let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?; + assert_eq!(encoded.len(), 3); + + let sizes = encoder.row_sizes(std::slice::from_ref(&col), &mut ctx)?; + assert_eq!(sizes.len(), 3); + + let encoded = convert_columns_with_options(std::slice::from_ref(&col), &options, &mut ctx)?; + assert_eq!(encoded.len(), 3); + + let sizes = compute_row_sizes_with_options(std::slice::from_ref(&col), &options, &mut ctx)?; + assert_eq!(sizes.len(), 3); + Ok(()) +} + +#[test] +fn row_encoder_new_accepts_sort_fields() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let encoder = RowEncoder::new([RowSortField::ascending()]); + let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); + + let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?; + assert_eq!(encoded.len(), 3); + Ok(()) +} + +#[test] +fn default_row_encoder_uses_default_fields() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let col0 = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); + let col1 = PrimitiveArray::from_iter([4i32, 5, 6]).into_array(); + + let encoded = RowEncoder::default().encode(&[col0, col1], &mut ctx)?; + assert_eq!(encoded.len(), 3); + Ok(()) +} + #[test] fn struct_sort_order() -> VortexResult<()> { use vortex_array::arrays::StructArray; @@ -232,7 +272,7 @@ fn struct_sort_order() -> VortexResult<()> { let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array(); let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])?.into_array(); - let encoded = convert_columns(&[struct_arr], &[SortField::default()], &mut ctx)?; + let encoded = convert_columns(&[struct_arr], &[RowSortField::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted = rows.clone(); @@ -260,7 +300,7 @@ fn row_size_struct_shape() -> VortexResult<()> { let sizes = compute_row_sizes( &[col0, col1], - &[SortField::default(), SortField::default()], + &[RowSortField::default(), RowSortField::default()], &mut ctx, )?; // Shape must be Struct { fixed, var } @@ -299,11 +339,11 @@ fn single_buffer_invariant() -> VortexResult<()> { let strings: Vec = (0..nrows) .map(|i| format!("row_{}_with_padding", i)) .collect(); - let col0 = PrimitiveArray::from_iter(primitives.clone()).into_array(); + let col0 = PrimitiveArray::from_iter(primitives).into_array(); let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array(); let encoded = convert_columns( &[col0, col1], - &[SortField::default(), SortField::default()], + &[RowSortField::default(), RowSortField::default()], &mut ctx, )?; From 48f59cef9066fd7771e4b8f6c2f6b51fc6d1fb39 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 22 May 2026 15:49:35 +0100 Subject: [PATCH 10/10] t Signed-off-by: Joe Isaacs --- vortex-row/src/codec.rs | 485 +++++++++++++++++++++++++++++---------- vortex-row/src/encode.rs | 87 +++---- vortex-row/src/size.rs | 42 ++-- vortex-row/src/tests.rs | 215 ++++++++++++++++- 4 files changed, 626 insertions(+), 203 deletions(-) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index 33270b0ad43..2818db62aba 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -9,12 +9,20 @@ //! descending-ness as configured by [`RowSortField`]. //! //! Conventions: -//! - Every value is preceded by a 1-byte sentinel that orders nulls relative to non-nulls. -//! - For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), not the -//! sentinel. +//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to +//! non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), +//! not the sentinel. +//! - Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each +//! for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the +//! value and column-byte boundaries stay aligned across rows. See +//! [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`]. //! - Fixed-width integers are big-endian, with the sign bit flipped for signed types. //! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top //! bit; negative flips all bits. +//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null +//! body** so two null parent rows produce byte-equal encodings: fixed-width children +//! contribute their fixed null encoding, and variable-width children collapse to a single +//! null sentinel byte. use vortex_array::Canonical; use vortex_array::ExecutionCtx; @@ -50,17 +58,23 @@ pub(crate) const VARLEN_BLOCK_SIZE: usize = 32; pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; const VARLEN_BLOCK_TOTAL_U32: u32 = 33; -/// Returns the size in bytes of the encoded form of a variable-length value of the given length. +/// Size in bytes of an encoded null varlen value (just the sentinel byte). +pub(crate) const VARLEN_NULL_SIZE: u32 = 1; +/// Size in bytes of an encoded empty varlen value (just the sentinel byte). +pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1; + +/// Returns the size in bytes of the encoded form of a non-empty variable-length value. +/// +/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1 +/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and +/// [`VARLEN_EMPTY_SIZE`] for empty values. A `u32` always suffices because a `BinaryView` +/// length is itself a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`. #[inline] -fn encoded_size_for_varlen(len: usize) -> u32 { - // 1 sentinel + ceil(len/32)*33 content bytes (or 1 zero terminator if empty) - if len == 0 { - 1 + 1 - } else { - let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)) - .vortex_expect("varlen block count must fit in u32"); - 1 + blocks * VARLEN_BLOCK_TOTAL_U32 - } +fn encoded_size_for_non_empty_varlen(len: usize) -> u32 { + debug_assert!(len > 0); + let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)) + .vortex_expect("varlen block count must fit in u32"); + 1 + blocks * VARLEN_BLOCK_TOTAL_U32 } /// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel). @@ -73,6 +87,43 @@ fn byte_width_u32(width: usize) -> u32 { u32::try_from(width).vortex_expect("native byte width must fit in u32") } +/// Returns the sentinel byte for a null varlen value. +/// +/// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and +/// independent of `descending`, matching the convention used by `arrow-row`. +#[inline] +fn varlen_null_sentinel(field: RowSortField) -> u8 { + if field.nulls_first { 0x00 } else { 0xFF } +} + +/// Returns the sentinel byte for an empty varlen value. +/// +/// Equal to `0x01` in ascending mode and `!0x01 = 0xFE` in descending mode. +#[inline] +fn varlen_empty_sentinel(field: RowSortField) -> u8 { + if field.descending { !0x01u8 } else { 0x01u8 } +} + +/// Returns the sentinel byte for a non-empty varlen value. +/// +/// Equal to `0x02` in ascending mode and `!0x02 = 0xFD` in descending mode. +#[inline] +fn varlen_non_empty_sentinel(field: RowSortField) -> u8 { + if field.descending { !0x02u8 } else { 0x02u8 } +} + +/// Returns the single-byte null sentinel used when a child contributes its canonical null +/// encoding inside a null parent struct/FSL row. +/// +/// For varlen children that is the varlen null sentinel; for everything else (including +/// nested struct/FSL when used as a variable-width child) it is the fixed-width null sentinel. +fn child_canonical_null_byte(child_dtype: &DType, field: RowSortField) -> u8 { + match child_dtype { + DType::Utf8(_) | DType::Binary(_) => varlen_null_sentinel(field), + _ => field.null_sentinel(), + } +} + /// Per-row width classification for a column. /// /// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless @@ -97,7 +148,8 @@ pub(crate) enum RowWidth { /// /// # Errors /// -/// Returns an error for dtypes that the row encoder does not support. +/// Returns an error for dtypes that the row encoder does not support. Width arithmetic that +/// would overflow `u32` is also reported as an error rather than silently saturating. pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { match dtype { DType::Null => Ok(RowWidth::Fixed(1)), @@ -107,6 +159,9 @@ pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { )))), DType::Decimal(dt, _) => { let vt = DecimalType::smallest_decimal_value_type(dt); + if matches!(vt, DecimalType::I256) { + vortex_bail!("row encoding for Decimal256 is not yet implemented"); + } Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( vt.byte_width(), )))) @@ -116,8 +171,13 @@ pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL // itself, then `n` copies of the element width. RowWidth::Fixed(w) => { - let body = w.saturating_mul(*n); - Ok(RowWidth::Fixed(body.saturating_add(1))) + let body = w + .checked_mul(*n) + .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?; + let total = body + .checked_add(1) + .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?; + Ok(RowWidth::Fixed(total)) } RowWidth::Variable => Ok(RowWidth::Variable), }, @@ -126,13 +186,21 @@ pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { let mut total: u32 = 1; // outer sentinel for field_dtype in fields.fields() { match row_width_for_dtype(&field_dtype)? { - RowWidth::Fixed(w) => total = total.saturating_add(w), + RowWidth::Fixed(w) => { + total = total.checked_add(w).ok_or_else(|| { + vortex_error::vortex_err!("Struct row width overflows u32") + })?; + } RowWidth::Variable => return Ok(RowWidth::Variable), } } Ok(RowWidth::Fixed(total)) } - DType::List(..) => Ok(RowWidth::Variable), + DType::List(..) => { + vortex_bail!( + "row encoding does not support variable-size List arrays (no well-defined ordering)" + ) + } DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()), DType::Variant(_) => { vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") @@ -241,13 +309,16 @@ fn add_size_varbinview( let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; let views = arr.views(); for (i, view) in views.iter().enumerate() { - let valid = mask.value(i); - if !valid { - sizes[i] += 1; // sentinel only + let contribution = if !mask.value(i) { + VARLEN_NULL_SIZE + } else if view.is_empty() { + VARLEN_EMPTY_SIZE } else { - let len = view.len() as usize; - sizes[i] += encoded_size_for_varlen(len); - } + encoded_size_for_non_empty_varlen(view.len() as usize) + }; + sizes[i] = sizes[i] + .checked_add(contribution) + .vortex_expect("per-row size overflow"); } Ok(()) } @@ -258,14 +329,31 @@ fn add_size_struct( sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - // null sentinel: 1 byte per row. + let n = arr.len(); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + // Outer sentinel: 1 byte per row. for s in sizes.iter_mut() { - *s += 1; + *s = s.checked_add(1).vortex_expect("per-row size overflow"); } - // Each field adds its own per-row size. + // Each child contributes its per-row size when the parent is non-null, and a canonical + // null contribution when the parent is null. For fixed-width children both are equal, + // so we can simply add the fixed width to every row. For variable-width children the + // null contribution collapses to 1 byte, ensuring null parent rows have a constant body. for child in arr.iter_unmasked_fields() { - let canonical = child.clone().execute::(ctx)?; - field_size(&canonical, field, sizes, ctx)?; + match row_width_for_dtype(child.dtype())? { + RowWidth::Fixed(w) => add_size_const(sizes, w), + RowWidth::Variable => { + let canonical = child.clone().execute::(ctx)?; + let mut child_sizes = vec![0u32; n]; + field_size(&canonical, field, &mut child_sizes, ctx)?; + for i in 0..n { + let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 }; + sizes[i] = sizes[i] + .checked_add(contribution) + .vortex_expect("per-row size overflow"); + } + } + } } Ok(()) } @@ -279,19 +367,45 @@ fn add_size_fsl( let n = arr.len(); debug_assert_eq!(n, sizes.len()); let list_size = arr.list_size() as usize; - let elements = arr.elements().clone().execute::(ctx)?; - debug_assert_eq!(elements.len(), n * list_size); - // Sizing: 1 sentinel + sum of element sizes (`list_size` per row). - // We compute element-wise sizes into a contiguous scratch buffer then reduce by row. - let mut elem_sizes = vec![0u32; n * list_size]; - field_size(&elements, field, &mut elem_sizes, ctx)?; - for i in 0..n { - let mut sum: u32 = 1; // sentinel - let base = i * list_size; - for j in 0..list_size { - sum = sum.saturating_add(elem_sizes[base + j]); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let elem_dtype = arr.elements().dtype(); + // Outer sentinel: 1 byte per row. + for s in sizes.iter_mut() { + *s = s.checked_add(1).vortex_expect("per-row size overflow"); + } + match row_width_for_dtype(elem_dtype)? { + RowWidth::Fixed(w) => { + // Each row has `list_size` fixed-width elements regardless of null parent mask. + let body = w + .checked_mul(u32::try_from(list_size).vortex_expect("list_size fits u32")) + .vortex_expect("FSL body width overflow"); + add_size_const(sizes, body); + } + RowWidth::Variable => { + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), n * list_size); + let mut elem_sizes = vec![0u32; n * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + for i in 0..n { + let body: u32 = if mask.value(i) { + let base = i * list_size; + let mut sum: u32 = 0; + for j in 0..list_size { + sum = sum + .checked_add(elem_sizes[base + j]) + .vortex_expect("FSL row body overflow"); + } + sum + } else { + // Canonical null body for FSL with variable element: one null sentinel + // per element. (Each element contributes `child_null_width = 1`.) + u32::try_from(list_size).vortex_expect("list_size fits u32") + }; + sizes[i] = sizes[i] + .checked_add(body) + .vortex_expect("FSL per-row size overflow"); + } } - sizes[i] += sum; } Ok(()) } @@ -462,24 +576,33 @@ fn encode_varbinview( row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], - ctx: &mut ExecutionCtx, + _ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; - let non_null = field.non_null_sentinel(); - let null = field.null_sentinel(); + let null_byte = varlen_null_sentinel(field); + let empty_byte = varlen_empty_sentinel(field); + let non_empty_byte = varlen_non_empty_sentinel(field); + // `with_iterator` yields `Some(bytes)` for non-null rows and `None` for null rows, + // so the iterator alone fully describes validity — no separate mask lookup needed. arr.with_iterator(|iter| { for (i, maybe) in iter.enumerate() { let pos = (row_offsets[i] + col_offset[i]) as usize; - if !mask.value(i) { - out[pos] = null; - col_offset[i] += 1; - continue; + match maybe { + None => { + out[pos] = null_byte; + col_offset[i] += VARLEN_NULL_SIZE; + } + Some([]) => { + out[pos] = empty_byte; + col_offset[i] += VARLEN_EMPTY_SIZE; + } + Some(bytes) => { + out[pos] = non_empty_byte; + let written = + encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], field.descending); + col_offset[i] += 1 + written; + } } - let bytes: &[u8] = maybe.unwrap_or(&[]); - out[pos] = non_null; - let written = encode_varlen_value(bytes, &mut out[pos + 1..], field.descending); - col_offset[i] += 1 + written; } }); Ok(()) @@ -498,37 +621,37 @@ fn encode_struct( let non_null = field.non_null_sentinel(); let null = field.null_sentinel(); - // First, write the sentinel for each row. We track the post-sentinel cursor offsets - // for the body in `body_cursors` (which start exactly at +1 of the input cursor). - // For null rows we additionally need to zero-fill the (uniform-width) field bytes, - // but because struct widths are variable in general, we record null indexes first - // and zero-fill after we know each row's contribution. - // - // To keep the implementation simple we: - // 1) advance the cursor past the sentinel, - // 2) recursively encode each field's bytes (the field encoders ignore nullness of - // the struct, but use their own per-field nullness), - // 3) for null struct rows, overwrite the body bytes with zeros so the encoded form - // depends only on the sentinel. - let body_start: Vec = (0..n).map(|i| col_offset[i] + 1).collect(); + // Write the outer sentinel for each row. for i in 0..n { let pos = (row_offsets[i] + col_offset[i]) as usize; out[pos] = if mask.value(i) { non_null } else { null }; col_offset[i] += 1; } + // Encode each child. For non-null parent rows the child contributes its actual encoding; + // for null parent rows the child contributes its canonical null encoding so that two null + // parent rows produce byte-equal output regardless of underlying child values. for child in arr.iter_unmasked_fields() { - let canonical = child.clone().execute::(ctx)?; - field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?; - } - - // Zero-fill body bytes of null rows (the field encoders may have written values). - for i in 0..n { - if !mask.value(i) { - let start = (row_offsets[i] + body_start[i]) as usize; - let end = (row_offsets[i] + col_offset[i]) as usize; - for b in &mut out[start..end] { - *b = 0; + match row_width_for_dtype(child.dtype())? { + RowWidth::Fixed(w) => { + let canonical = child.clone().execute::(ctx)?; + field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?; + // Replace null parent rows with the canonical null encoding (the same as a + // child-level null: null sentinel followed by zero-padded value bytes). + let null_byte = child_canonical_null_byte(child.dtype(), field); + for i in 0..n { + if !mask.value(i) { + let end = (row_offsets[i] + col_offset[i]) as usize; + let start = end - w as usize; + out[start] = null_byte; + for b in &mut out[start + 1..end] { + *b = 0; + } + } + } + } + RowWidth::Variable => { + encode_variable_child(child, field, &mask, row_offsets, col_offset, out, ctx)?; } } } @@ -544,58 +667,181 @@ fn encode_fsl( out: &mut [u8], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let n = arr.len(); + let nrows = arr.len(); let list_size = arr.list_size() as usize; - let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let mask = arr.as_ref().validity()?.execute_mask(nrows, ctx)?; let non_null = field.non_null_sentinel(); let null = field.null_sentinel(); - let elements = arr.elements().clone().execute::(ctx)?; - debug_assert_eq!(elements.len(), n * list_size); + let elem_dtype = arr.elements().dtype().clone(); - // Write sentinels and remember body start for null zero-fill. - let body_start: Vec = (0..n).map(|i| col_offset[i] + 1).collect(); - for i in 0..n { + // Outer sentinel. + for i in 0..nrows { let pos = (row_offsets[i] + col_offset[i]) as usize; out[pos] = if mask.value(i) { non_null } else { null }; col_offset[i] += 1; } - // Encode all `n * list_size` elements into the body. Build a fresh - // (offsets, cursors) pair where each element gets one slot. Then sum bytes back - // into the parent col_offset. - let mut elem_sizes = vec![0u32; n * list_size]; - field_size(&elements, field, &mut elem_sizes, ctx)?; - // Element offsets are sequential starting at each parent's current cursor position. - let mut elem_offsets = vec![0u32; n * list_size]; - for i in 0..n { - let mut acc = row_offsets[i] + col_offset[i]; - for j in 0..list_size { - elem_offsets[i * list_size + j] = acc; - acc = acc.saturating_add(elem_sizes[i * list_size + j]); + match row_width_for_dtype(&elem_dtype)? { + RowWidth::Fixed(w) => { + // Fixed-width elements: encode the elements array directly (its length is + // nrows * list_size) using a derived (offsets, cursors) pair. Then overwrite + // the body of null parent rows with the canonical null encoding per element. + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), nrows * list_size); + let list_size_u32 = arr.list_size(); + let row_body_bytes = w + .checked_mul(list_size_u32) + .vortex_expect("FSL body width overflow"); + let mut elem_offsets = vec![0u32; nrows * list_size]; + for i in 0..nrows { + let base = row_offsets[i] + col_offset[i]; + for j in 0u32..list_size_u32 { + elem_offsets[i * list_size + j as usize] = base + j * w; + } + } + let mut elem_cursors = vec![0u32; nrows * list_size]; + field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?; + for i in 0..nrows { + col_offset[i] = col_offset[i] + .checked_add(row_body_bytes) + .vortex_expect("FSL row body overflow"); + } + // Canonical null body for null parent rows: one null encoding per element. + let null_byte = child_canonical_null_byte(&elem_dtype, field); + let elem_width = w as usize; + for i in 0..nrows { + if !mask.value(i) { + let end = (row_offsets[i] + col_offset[i]) as usize; + let start = end - row_body_bytes as usize; + let mut pos = start; + for _ in 0..list_size { + out[pos] = null_byte; + for b in &mut out[pos + 1..pos + elem_width] { + *b = 0; + } + pos += elem_width; + } + } + } } - } - let mut elem_cursors = vec![0u32; n * list_size]; - field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?; - // Advance the parent cursors by the total per-row element bytes. - for i in 0..n { - let mut sum: u32 = 0; - for j in 0..list_size { - sum = sum.saturating_add(elem_sizes[i * list_size + j]); + RowWidth::Variable => { + // Variable-width elements: for null parent rows the canonical body is exactly + // `list_size` null sentinel bytes (one per element). For non-null parent rows, + // encode each element via a scratch buffer and copy into out. + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), nrows * list_size); + let mut elem_sizes = vec![0u32; nrows * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum(); + let total_usize = + usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize"); + let mut scratch = vec![0u8; total_usize]; + let mut scratch_offsets = Vec::with_capacity(nrows * list_size); + let mut acc: u32 = 0; + for &s in &elem_sizes { + scratch_offsets.push(acc); + acc = acc + .checked_add(s) + .vortex_expect("FSL scratch offset overflow"); + } + let mut scratch_cursors = vec![0u32; nrows * list_size]; + field_encode( + &elements, + field, + &scratch_offsets, + &mut scratch_cursors, + &mut scratch, + ctx, + )?; + let null_byte = child_canonical_null_byte(&elem_dtype, field); + for i in 0..nrows { + let dst = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + let mut body_bytes: u32 = 0; + for j in 0..list_size { + let k = i * list_size + j; + let src = scratch_offsets[k] as usize; + let sz = elem_sizes[k] as usize; + out[dst + body_bytes as usize..dst + body_bytes as usize + sz] + .copy_from_slice(&scratch[src..src + sz]); + body_bytes = body_bytes + .checked_add(elem_sizes[k]) + .vortex_expect("FSL body bytes overflow"); + } + col_offset[i] = col_offset[i] + .checked_add(body_bytes) + .vortex_expect("FSL row offset overflow"); + } else { + for offset in 0..list_size { + out[dst + offset] = null_byte; + } + col_offset[i] = col_offset[i] + .checked_add(u32::try_from(list_size).vortex_expect("list_size fits u32")) + .vortex_expect("FSL row offset overflow"); + } + } } - col_offset[i] = col_offset[i].saturating_add(sum); } - // Zero-fill null bodies. + Ok(()) +} + +/// Encode one variable-width child of a struct: for non-null parent rows, copy the child's +/// natural encoding from a scratch buffer; for null parent rows, write a single +/// `child_canonical_null_byte`. +fn encode_variable_child( + child: &vortex_array::ArrayRef, + field: RowSortField, + parent_mask: &vortex_mask::Mask, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = child.len(); + let canonical = child.clone().execute::(ctx)?; + + // Size and encode the child into a sequential scratch buffer. + let mut child_sizes = vec![0u32; n]; + field_size(&canonical, field, &mut child_sizes, ctx)?; + let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum(); + let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize"); + let mut scratch = vec![0u8; total_usize]; + let mut scratch_offsets = Vec::with_capacity(n); + let mut acc: u32 = 0; + for &s in &child_sizes { + scratch_offsets.push(acc); + acc = acc + .checked_add(s) + .vortex_expect("child scratch offset overflow"); + } + let mut scratch_cursors = vec![0u32; n]; + field_encode( + &canonical, + field, + &scratch_offsets, + &mut scratch_cursors, + &mut scratch, + ctx, + )?; + + let null_byte = child_canonical_null_byte(child.dtype(), field); for i in 0..n { - if !mask.value(i) { - let start = (row_offsets[i] + body_start[i]) as usize; - let end = (row_offsets[i] + col_offset[i]) as usize; - for b in &mut out[start..end] { - *b = 0; - } + let dst = (row_offsets[i] + col_offset[i]) as usize; + if parent_mask.value(i) { + let src = scratch_offsets[i] as usize; + let sz = child_sizes[i] as usize; + out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]); + col_offset[i] = col_offset[i] + .checked_add(child_sizes[i]) + .vortex_expect("col_offset overflow"); + } else { + out[dst] = null_byte; + col_offset[i] = col_offset[i] + .checked_add(1) + .vortex_expect("col_offset overflow"); } } - Ok(()) } @@ -611,15 +857,12 @@ fn encode_extension( field_encode(&storage, field, row_offsets, col_offset, out, ctx) } -/// Encode a variable-length byte slice into `out` in 32-byte blocks with -/// continuation markers. Returns the number of bytes written. -fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { +/// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with +/// continuation/length markers. Returns the number of bytes written. Empty values are +/// encoded by the caller as a single sentinel byte and never reach this function. +fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { + debug_assert!(!bytes.is_empty()); let xor = if descending { 0xFFu8 } else { 0x00 }; - if bytes.is_empty() { - // Single zero terminator. - out[0] = xor; - return 1; - } let mut written = 0usize; let mut remaining = bytes; while remaining.len() > VARLEN_BLOCK_SIZE { diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index 4bc4962503e..d3721e49a6e 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -11,7 +11,6 @@ use std::sync::Arc; use vortex_array::ArrayRef; -use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::ListViewArray; @@ -25,7 +24,6 @@ use vortex_array::scalar_fn::ExecutionArgs; use vortex_array::scalar_fn::ScalarFnId; use vortex_array::scalar_fn::ScalarFnVTable; use vortex_array::validity::Validity; -use vortex_buffer::Buffer; use vortex_buffer::BufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -34,7 +32,6 @@ use vortex_session::VortexSession; use crate::codec; use crate::options::RowEncodingOptions; -use crate::options::RowSortField; use crate::options::deserialize_row_encoding_options; use crate::options::serialize_row_encoding_options; use crate::size::compute_sizes; @@ -107,6 +104,9 @@ fn execute_row_encode( ctx: &mut ExecutionCtx, ) -> VortexResult { let nrows = args.row_count(); + if u32::try_from(nrows).is_err() { + vortex_bail!("row-encoded input has {} rows, exceeds u32::MAX", nrows); + } // ===== Phase 1: classify + size pass ===== let crate::size::SizePassResult { @@ -122,7 +122,9 @@ fn execute_row_encode( let total: u64 = (nrows as u64) .checked_mul(u64::from(fixed_per_row)) .and_then(|t| t.checked_add(var_total)) - .vortex_expect("row-encoded total bytes overflow"); + .ok_or_else(|| { + vortex_error::vortex_err!("row-encoded total bytes overflow u64 (nrows * fixed + var)") + })?; if total > u32::MAX as u64 { vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total); } @@ -138,44 +140,42 @@ fn execute_row_encode( // listview_offsets[i] is the absolute byte offset where row `i` begins. // For pure-fixed: i * fixed_per_row. // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths. - let mut listview_offsets: Vec = Vec::with_capacity(nrows); + // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end. + let nrows_u32 = + u32::try_from(nrows).vortex_expect("nrows fits u32 (validated earlier in this function)"); + let mut listview_offsets: BufferMut = BufferMut::with_capacity(nrows); match var_lengths.as_ref() { None => { - for i in 0..nrows { - let row_idx = - u32::try_from(i).vortex_expect("row index must fit in u32 after validation"); - listview_offsets.push( - row_idx - .checked_mul(fixed_per_row) - .vortex_expect("row offset overflow (already validated total fits in u32)"), - ); + for row_idx in 0..nrows_u32 { + // Total bytes already fit in u32, so row_idx * fixed_per_row also does. + listview_offsets.push(row_idx * fixed_per_row); } } Some(v) => { let mut acc: u32 = 0; - for (i, &l) in v.iter().enumerate() { - let row_idx = - u32::try_from(i).vortex_expect("row index must fit in u32 after validation"); - let off = row_idx - .checked_mul(fixed_per_row) - .and_then(|t| t.checked_add(acc)) - .vortex_expect("row offset overflow"); - listview_offsets.push(off); - acc = acc.checked_add(l).vortex_expect("varlen prefix overflow"); + for (row_idx, &l) in (0..nrows_u32).zip(v.iter()) { + // The arithmetic below cannot overflow because we already verified the + // total fits in u32. + listview_offsets.push(row_idx * fixed_per_row + acc); + acc += l; } } } + let listview_offsets_slice: &[u32] = listview_offsets.as_slice(); - // Per-row write cursor (also doubles as the ListView `sizes` slot when done). - let mut row_cursors = vec![0u32; nrows]; + // Per-row write cursor (also doubles as the ListView `sizes` slot when done). We build + // it as a BufferMut so we can hand it directly to the output PrimitiveArray. + let mut row_cursors: BufferMut = BufferMut::with_capacity(nrows); + row_cursors.push_n(0u32, nrows); // ===== Phase 4: encode columns via the cursor path ===== - for (i, col) in columns.iter().enumerate() { - dispatch_encode( - col, + // Each column was canonicalized once during the size pass; reuse that canonical form. + for (i, canonical) in columns.iter().enumerate() { + codec::field_encode( + canonical, options.fields[i], - &listview_offsets, - &mut row_cursors, + listview_offsets_slice, + row_cursors.as_mut_slice(), &mut out_buf, ctx, )?; @@ -183,34 +183,11 @@ fn execute_row_encode( // ===== Phase 5: build ListView output ===== let elements = PrimitiveArray::new(out_buf.freeze(), Validity::NonNullable).into_array(); - let offsets_arr = PrimitiveArray::new( - Buffer::::copy_from(&listview_offsets), - Validity::NonNullable, - ) - .into_array(); - let sizes_arr = PrimitiveArray::new( - Buffer::::copy_from(&row_cursors), - Validity::NonNullable, - ) - .into_array(); + let offsets_arr = + PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array(); + let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array(); Ok( ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)? .into_array(), ) } - -/// Dispatch a single column's encoding into the shared `out` buffer through the canonical path. -/// -/// TODO(row): add per-encoding fast paths here so Constant, Dictionary, and compressed arrays -/// can write row bytes without canonicalizing. -pub(crate) fn dispatch_encode( - col: &ArrayRef, - field: RowSortField, - offsets: &[u32], - cursors: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let canonical = col.clone().execute::(ctx)?; - codec::field_encode(&canonical, field, offsets, cursors, out, ctx) -} diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index 48d4f8e4dbc..26269081ce7 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -26,7 +26,6 @@ use vortex_array::scalar_fn::ScalarFnId; use vortex_array::scalar_fn::ScalarFnVTable; use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_session::VortexSession; @@ -34,16 +33,19 @@ use vortex_session::VortexSession; use crate::codec; use crate::codec::RowWidth; use crate::options::RowEncodingOptions; -use crate::options::RowSortField; use crate::options::deserialize_row_encoding_options; use crate::options::serialize_row_encoding_options; /// Result of the size pass: enough information for both [`RowSize::execute`] and the /// downstream [`RowEncode`](super::encode::RowEncode) pipeline. +/// +/// `columns` holds the canonicalized form of each input so the encode pass can write bytes +/// without re-decoding — a single canonicalization per column is shared between size and +/// encode. pub(crate) struct SizePassResult { pub fixed_per_row: u32, pub var_lengths: Option>, - pub columns: Vec, + pub columns: Vec, } /// Walk N input columns once, classifying each as fixed-width or variable-length and @@ -74,7 +76,7 @@ pub(crate) fn compute_sizes( } let nrows = args.row_count(); - let mut columns: Vec = Vec::with_capacity(n_inputs); + let mut columns: Vec = Vec::with_capacity(n_inputs); let mut fixed_per_row: u32 = 0; let mut var_lengths: Option> = None; @@ -88,18 +90,21 @@ pub(crate) fn compute_sizes( nrows ); } - match codec::row_width_for_dtype(col.dtype())? { + let width = codec::row_width_for_dtype(col.dtype())?; + // Canonicalize once and reuse for both sizing (variable columns) and encoding. + let canonical = col.execute::(ctx)?; + match width { RowWidth::Fixed(w) => { - fixed_per_row = fixed_per_row - .checked_add(w) - .vortex_expect("row width overflow"); + fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(|| { + vortex_error::vortex_err!("per-row fixed width overflows u32 at column {}", i) + })?; } RowWidth::Variable => { let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]); - dispatch_size(&col, options.fields[i], v, ctx)?; + codec::field_size(&canonical, options.fields[i], v, ctx)?; } } - columns.push(col); + columns.push(canonical); } Ok(SizePassResult { @@ -109,7 +114,8 @@ pub(crate) fn compute_sizes( }) } -/// Variadic scalar function that, given N input columns and per-column [`RowSortField`]s, +/// Variadic scalar function that, given N input columns and per-column +/// [`RowSortField`](crate::RowSortField)s, /// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the /// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode). /// @@ -208,17 +214,3 @@ impl ScalarFnVTable for RowSize { false } } - -/// Dispatch a single column's per-row size contribution through the canonical path. -/// -/// TODO(row): add per-encoding fast paths here so Constant, Dictionary, and compressed arrays -/// can contribute row sizes without canonicalizing. -pub(crate) fn dispatch_size( - col: &ArrayRef, - field: RowSortField, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let canonical = col.clone().execute::(ctx)?; - codec::field_size(&canonical, field, sizes, ctx) -} diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs index a1eaadf3803..62e0e4cfb98 100644 --- a/vortex-row/src/tests.rs +++ b/vortex-row/src/tests.rs @@ -323,8 +323,9 @@ fn row_size_struct_shape() -> VortexResult<()> { let var_prim = var.clone().execute::(&mut ctx)?; let v: &[u32] = var_prim.as_slice(); assert_eq!(v.len(), 5); - // empty string: sentinel(1) + 1 byte; non-empty: sentinel(1) + 33 bytes (single block). - let expected: Vec = vec![34, 34, 34, 2, 34]; + // empty string: just the empty sentinel (1 byte); null or non-empty: + // sentinel(1) + 33 bytes (single block). + let expected: Vec = vec![34, 34, 34, 1, 34]; assert_eq!(v, expected.as_slice()); Ok(()) } @@ -362,3 +363,213 @@ fn single_buffer_invariant() -> VortexResult<()> { ); Ok(()) } + +/// Regression: with the previous 2-sentinel varlen scheme, an empty col1 followed by a +/// non-empty col1 that happened to start with `\0` would corrupt multi-column lex order +/// because col2's first byte aligned against col1's pad in the longer row. With the +/// 3-sentinel scheme byte position 0 alone distinguishes empty from non-empty, so column +/// boundaries always align. +#[test] +fn multi_column_varlen_empty_vs_nul_byte_string() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // col1: empty vs single 0-byte. col2: same int for all rows. + let col1 = VarBinViewArray::from_iter_str(["", "\0", "a", "ab"]).into_array(); + let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1]).into_array(); + let encoded = convert_columns( + &[col1, col2], + &[RowSortField::default(), RowSortField::default()], + &mut ctx, + )?; + let rows = collect_row_bytes(&encoded); + + // Logical natural order of col1: "" < "\0" < "a" < "ab". + // Byte sort of the encoded rows must put them in that same order. + let sorted_indices_by_bytes = { + let mut indices: Vec = (0..rows.len()).collect(); + indices.sort_by(|a, b| rows[*a].cmp(&rows[*b])); + indices + }; + assert_eq!( + sorted_indices_by_bytes, + vec![0, 1, 2, 3], + "byte sort must match natural col1 order; sorted indices were {:?}", + sorted_indices_by_bytes + ); + Ok(()) +} + +/// Regression: null col1 must sort distinct from empty col1 even when col2 follows. With +/// the 3-sentinel scheme null=0x00, empty=0x01 differ at byte 0. +#[test] +fn multi_column_varlen_null_vs_empty() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let col1 = VarBinViewArray::from_iter_nullable_str([ + None::<&str>, + Some(""), + Some("a"), + None, + Some(""), + ]) + .into_array(); + let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1, 1]).into_array(); + let encoded = convert_columns( + &[col1, col2], + &[RowSortField::ascending(), RowSortField::ascending()], + &mut ctx, + )?; + let rows = collect_row_bytes(&encoded); + + // Nulls first, then empties, then non-empties — and all the col2 values are identical + // so col1 fully determines the order. + // Categorise each row by the leading byte of col1's encoding. + let mut buckets: [Vec; 3] = [Vec::new(), Vec::new(), Vec::new()]; + for (i, row) in rows.iter().enumerate() { + let bucket = match row[0] { + 0x00 => 0, // null + 0x01 => 1, // empty + 0x02 => 2, // non-empty + other => panic!("unexpected varlen sentinel: {:#x}", other), + }; + buckets[bucket].push(i); + } + assert_eq!(buckets[0].len(), 2, "two null col1 rows"); + assert_eq!(buckets[1].len(), 2, "two empty col1 rows"); + assert_eq!(buckets[2].len(), 1, "one non-empty col1 row"); + + // All null rows must be byte-equal (same col2 value, both col1 null, single sentinel). + let null_rows: Vec<&Vec> = buckets[0].iter().map(|&i| &rows[i]).collect(); + assert_eq!( + null_rows[0], null_rows[1], + "null col1 rows must be byte-equal" + ); + // Same for empty. + let empty_rows: Vec<&Vec> = buckets[1].iter().map(|&i| &rows[i]).collect(); + assert_eq!( + empty_rows[0], empty_rows[1], + "empty col1 rows must be byte-equal" + ); + + // Byte sort must group: nulls, empties, non-empties (because leading byte differs). + let mut sorted = rows.clone(); + sorted.sort(); + assert_eq!(sorted[0][0], 0x00); + assert_eq!(sorted[1][0], 0x00); + assert_eq!(sorted[2][0], 0x01); + assert_eq!(sorted[3][0], 0x01); + assert_eq!(sorted[4][0], 0x02); + Ok(()) +} + +/// Regression: descending varlen must put non-empty before empty (natural "" < "a" inverts +/// to "a" < "" under descending). The 3-sentinel scheme uses `!empty < !non_empty` so +/// non-empty's first byte is smaller than empty's first byte. +#[test] +fn varlen_descending_empty_vs_non_empty() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let col = VarBinViewArray::from_iter_str(["a", "", "abc"]).into_array(); + let encoded = convert_columns(&[col], &[RowSortField::descending()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + // Natural order: "" < "a" < "abc"; descending byte sort: "abc" first, "" last. + let mut sorted = rows.clone(); + sorted.sort(); + // sorted[0] = encoded("abc"), sorted[1] = encoded("a"), sorted[2] = encoded("") + assert_eq!(sorted[0], rows[2], "abc first in descending"); + assert_eq!(sorted[1], rows[0], "a second"); + assert_eq!(sorted[2], rows[1], "empty last"); + Ok(()) +} + +/// Regression: two null parent struct rows whose underlying child values differ in length +/// must still produce byte-equal encodings, because the parent emits a canonical null +/// body (one null sentinel per variable child) regardless of the underlying values. +#[test] +fn null_struct_rows_with_varying_child_lengths_are_byte_equal() -> VortexResult<()> { + use vortex_array::arrays::StructArray; + use vortex_array::dtype::FieldName; + use vortex_array::dtype::FieldNames; + use vortex_array::validity::Validity; + use vortex_buffer::BitBuffer; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // Build a nullable struct{name: utf8} where rows 0 and 2 are null but the underlying + // child has different length data ("short" vs "much longer text data"). + let names = + VarBinViewArray::from_iter_str(["short", "x", "much longer text data"]).into_array(); + let field_names = FieldNames::from([FieldName::from("name")]); + let bits = BitBuffer::from_iter([false, true, false]); + let validity = Validity::from(bits); + let struct_arr = StructArray::try_new(field_names, vec![names], 3, validity)?.into_array(); + + let encoded = convert_columns(&[struct_arr], &[RowSortField::ascending()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + assert_eq!(rows.len(), 3); + // Both null parent rows must produce identical bytes despite the divergent children. + assert_eq!( + rows[0], rows[2], + "two null parent struct rows must encode to byte-equal slices" + ); + // And the non-null row's leading sentinel must differ from the null sentinel. + assert_ne!(rows[0][0], rows[1][0], "null vs non-null sentinel differs"); + Ok(()) +} + +#[test] +fn primitive_f32_sort_order() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = vec![-1.5, 0.0, 1.5, f32::INFINITY, f32::NEG_INFINITY]; + let col = PrimitiveArray::from_iter(values.clone()).into_array(); + let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + let mut sorted_rows = rows.clone(); + sorted_rows.sort(); + let mut sorted_idx: Vec = (0..values.len()).collect(); + sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap()); + let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted_rows, expected); + Ok(()) +} + +#[test] +fn primitive_f16_sort_order() -> VortexResult<()> { + use vortex_array::dtype::half::f16; + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = vec![ + f16::from_f32(-1.5), + f16::from_f32(0.0), + f16::from_f32(1.5), + f16::INFINITY, + f16::NEG_INFINITY, + ]; + let col = PrimitiveArray::from_iter(values.clone()).into_array(); + let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + let mut sorted_rows = rows.clone(); + sorted_rows.sort(); + let mut sorted_idx: Vec = (0..values.len()).collect(); + sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap()); + let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect(); + assert_eq!(sorted_rows, expected); + Ok(()) +} + +#[test] +fn reject_list_dtype_early() { + use vortex_array::ArrayRef; + use vortex_array::arrays::ListArray; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let offsets = PrimitiveArray::new(buffer![0u32, 1, 2], Validity::NonNullable).into_array(); + let elements = PrimitiveArray::from_iter([10i32, 20]).into_array(); + let list: ArrayRef = ListArray::try_new(elements, offsets, Validity::NonNullable) + .unwrap() + .into_array(); + let err = convert_columns(&[list], &[RowSortField::default()], &mut ctx) + .expect_err("List should not be accepted"); + assert!( + err.to_string().contains("List"), + "expected error mentioning List, got: {err}" + ); +}