Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 203 additions & 0 deletions encodings/runend/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use vortex_array::ArrayRef;
use vortex_array::ArrayView;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::arrays::Bool;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::Primitive;
Expand Down Expand Up @@ -94,6 +95,67 @@ pub fn runend_encode(
(ends, values)
}

/// Run-end encode a `BoolArray`, returning a tuple of `(ends, values)`.
///
/// `ends` is assembled as a `u64` buffer, passed through `narrow`, and tagged with an exact
/// `IsStrictSorted` statistic before being returned. `values` holds one entry per run.
///
/// The shape of the result depends on the input:
/// - all-invalid validity: a single end equal to `array.len()` paired with a single-row null
///   [`ConstantArray`];
/// - empty input: empty ends and an empty [`BoolArray`];
/// - otherwise: a [`BoolArray`] with one value per run. When the input carries a validity
///   array, validity takes part in run detection.
///
/// NOTE(review): the all-invalid branch returns before the emptiness check, so an empty
/// all-invalid input would produce ends `[0]` with a one-row values array. Confirm this is
/// intended.
///
/// The `vortex_expect` calls treat three things as invariants: validity can be derived, a
/// validity array can be executed into a [`BoolArray`], and the ends can be narrowed.
pub fn runend_encode_bool(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We used to have a custom run-end-bool encoding at some point.

The point is you don't need to store the "values" array because you know it just flip-flops. So you could say, values starts at true, and maybe offsets are 0, 0, 4, 10, ...

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But also... we already support run-end bools because we sometimes push-down a predicate expression over non-bool run-end types. So we can use this as a compression scheme now, and update later to preferred run-end bool if we want one.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whenever we push down expression that end up with run end bools we immediately canonicalise them. The format we have is really wasteful for bools.

array: ArrayView<Bool>,
ctx: &mut ExecutionCtx,
) -> (PrimitiveArray, ArrayRef) {
// Reduce validity to an optional bitmap: `None` means no per-row validity needs tracking.
let validity = match array
.validity()
.vortex_expect("run-end validity should be derivable")
{
Validity::NonNullable => None,
Validity::AllValid => None,
Validity::AllInvalid => {
// Every row is null: emit one run spanning the whole array and return early.
let ends = PrimitiveArray::new(buffer![array.len() as u64], Validity::NonNullable)
.narrow(ctx)
.vortex_expect("Ends must succeed downcasting");
ends.statistics()
.set(Stat::IsStrictSorted, Precision::Exact(true.into()));
return (
ends,
ConstantArray::new(Scalar::null(array.dtype().clone()), 1).into_array(),
);
}
Validity::Array(a) => {
// Execute the validity array into a bit buffer so it can be read row by row.
let bool_array = a
.execute::<BoolArray>(ctx)
.vortex_expect("validity array must be convertible to bool");
Some(bool_array.to_bit_buffer())
}
};

let bits = array.to_bit_buffer();
// No rows: return empty ends and an empty values array with the input's nullability.
if bits.is_empty() {
let ends = PrimitiveArray::new(Buffer::<u64>::empty(), Validity::NonNullable)
.narrow(ctx)
.vortex_expect("Ends must succeed downcasting");
ends.statistics()
.set(Stat::IsStrictSorted, Precision::Exact(true.into()));
return (
ends,
BoolArray::new(BitBuffer::empty(), array.dtype().nullability().into()).into_array(),
);
}

// Choose the encoder based on whether a validity bitmap has to be consulted.
let (ends, values) = match validity {
None => runend_encode_bools(&bits, array.dtype().nullability().into()),
Some(validity) => runend_encode_nullable_bools(&bits, validity),
};

// The helpers return `u64` ends; narrow them before handing them back.
let ends = PrimitiveArray::new(ends, Validity::NonNullable)
.narrow(ctx)
.vortex_expect("Ends must succeed downcasting");

ends.statistics()
.set(Stat::IsStrictSorted, Precision::Exact(true.into()));

(ends, values.into_array())
}

fn runend_encode_primitive<T: NativePType>(elements: &[T]) -> (Buffer<u64>, Buffer<T>) {
let mut ends = BufferMut::empty();
let mut values = BufferMut::empty();
Expand All @@ -119,6 +181,81 @@ fn runend_encode_primitive<T: NativePType>(elements: &[T]) -> (Buffer<u64>, Buff
(ends.freeze(), values.freeze())
}

/// Run-end encodes a non-empty bit buffer in which every element is treated as valid.
///
/// Returns the cumulative (exclusive) end offset of each run together with a [`BoolArray`]
/// holding one bit per run. The supplied `validity` is attached to that array unchanged.
fn runend_encode_bools(elements: &BitBuffer, validity: Validity) -> (Buffer<u64>, BoolArray) {
    debug_assert!(!elements.is_empty());

    let mut run_ends = BufferMut::empty();
    let mut run_values = BitBufferMut::with_capacity(elements.len());

    let mut current = elements.value(0);
    // A run closes at `position` whenever the bit there differs from the run in progress.
    for (position, bit) in elements.iter().enumerate().skip(1) {
        if bit != current {
            run_ends.push(position as u64);
            run_values.append(current);
            current = bit;
        }
    }

    // The final run always ends at the total element count.
    run_ends.push(elements.len() as u64);
    run_values.append(current);

    let values = BoolArray::new(run_values.freeze(), validity);
    (run_ends.freeze(), values)
}

fn runend_encode_nullable_bools(
elements: &BitBuffer,
element_validity: BitBuffer,
) -> (Buffer<u64>, BoolArray) {
debug_assert!(!elements.is_empty());

let mut ends = BufferMut::empty();
let mut values = BitBufferMut::with_capacity(elements.len());
let mut validity = BitBufferMut::with_capacity(elements.len());

let mut prev = element_validity.value(0).then(|| elements.value(0));
let mut end = 1;
for value in elements
.iter()
.zip(element_validity.iter())
.map(|(value, is_valid)| is_valid.then_some(value))
.skip(1)
{
if value != prev {
ends.push(end);
match prev {
None => {
validity.append(false);
values.append(false);
}
Some(previous) => {
validity.append(true);
values.append(previous);
}
}
}
prev = value;
end += 1;
}
ends.push(end);

match prev {
None => {
validity.append(false);
values.append(false);
}
Some(previous) => {
validity.append(true);
values.append(previous);
}
}

(
ends.freeze(),
BoolArray::new(values.freeze(), Validity::from(validity.freeze())),
)
}

fn runend_encode_nullable_primitive<T: NativePType>(
elements: &[T],
element_validity: BitBuffer,
Expand Down Expand Up @@ -322,6 +459,7 @@ pub fn runend_decode_varbinview(
mod tests {
use vortex_array::LEGACY_SESSION;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::assert_arrays_eq;
use vortex_array::validity::Validity;
Expand All @@ -331,6 +469,7 @@ mod tests {

use crate::compress::runend_decode_primitive;
use crate::compress::runend_encode;
use crate::compress::runend_encode_bool;

#[test]
fn encode() -> VortexResult<()> {
Expand Down Expand Up @@ -383,6 +522,70 @@ mod tests {
Ok(())
}

/// A non-nullable input with three runs yields three ends and three run values.
#[test]
fn encode_bool() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Runs: true x2, false x3, true x1.
    let input = BoolArray::from_iter([true, true, false, false, false, true]);
    let (run_ends, run_values) = runend_encode_bool(input.as_view(), &mut ctx);
    let run_values = run_values.execute::<BoolArray>(&mut ctx)?;

    let want_ends = PrimitiveArray::from_iter(vec![2u8, 5, 6]);
    assert_arrays_eq!(run_ends, want_ends);

    let want_values = BoolArray::from_iter([true, false, true]);
    assert_arrays_eq!(run_values, want_values);

    Ok(())
}

/// Null stretches in a nullable input are encoded as runs of their own.
#[test]
fn encode_bool_nullable() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Runs: true x2, null x2, false x2, null x1, true x1.
    let input = BoolArray::from_iter([
        Some(true),
        Some(true),
        None,
        None,
        Some(false),
        Some(false),
        None,
        Some(true),
    ]);
    let (run_ends, run_values) = runend_encode_bool(input.as_view(), &mut ctx);
    let run_values = run_values.execute::<BoolArray>(&mut ctx)?;

    let want_ends = PrimitiveArray::from_iter(vec![2u8, 4, 6, 7, 8]);
    assert_arrays_eq!(run_ends, want_ends);

    let want_values = BoolArray::from_iter([Some(true), None, Some(false), None, Some(true)]);
    assert_arrays_eq!(run_values, want_values);

    Ok(())
}

/// An all-invalid input collapses to one run whose single value is null.
#[test]
fn encode_bool_all_null() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    let input = BoolArray::new(BitBuffer::new_unset(5), Validity::AllInvalid);
    let (run_ends, run_values) = runend_encode_bool(input.as_view(), &mut ctx);
    let run_values = run_values.execute::<BoolArray>(&mut ctx)?;

    // One run covering all five rows.
    let want_ends = PrimitiveArray::from_iter(vec![5u8]);
    assert_arrays_eq!(run_ends, want_ends);

    let want_values = BoolArray::from_iter([Option::<bool>::None]);
    assert_arrays_eq!(run_values, want_values);

    Ok(())
}

/// An empty input produces empty ends and empty values.
#[test]
fn encode_bool_empty() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    let input = BoolArray::from_iter(Vec::<bool>::new());
    let (run_ends, run_values) = runend_encode_bool(input.as_view(), &mut ctx);
    let run_values = run_values.execute::<BoolArray>(&mut ctx)?;

    assert!(run_ends.is_empty());
    assert!(run_values.is_empty());

    Ok(())
}

#[test]
fn decode() -> VortexResult<()> {
let mut ctx = LEGACY_SESSION.create_execution_ctx();
Expand Down
1 change: 1 addition & 0 deletions vortex-btrblocks/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[
// Bool schemes.
////////////////////////////////////////////////////////////////////////////////////////////////
&bool::BoolConstantScheme,
&bool::BoolRunEndScheme,
////////////////////////////////////////////////////////////////////////////////////////////////
// Integer schemes.
////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
56 changes: 56 additions & 0 deletions vortex-btrblocks/src/canonical_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,17 @@ mod tests {
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::Constant;
use vortex_array::arrays::List;
use vortex_array::arrays::ListArray;
use vortex_array::arrays::ListView;
use vortex_array::arrays::ListViewArray;
use vortex_array::arrays::list::ListArrayExt;
use vortex_array::assert_arrays_eq;
use vortex_array::session::ArraySession;
use vortex_array::validity::Validity;
use vortex_buffer::BitBuffer;
use vortex_buffer::buffer;
use vortex_error::VortexResult;
use vortex_runend::RunEnd;
use vortex_session::VortexSession;

use crate::BtrBlocksCompressor;
Expand Down Expand Up @@ -191,4 +194,57 @@ mod tests {
assert_arrays_eq!(compressed, array);
Ok(())
}

/// A bool array made of four 128-element runs is compressed to run-end encoding and still
/// compares equal to the original.
#[test]
fn test_bool_runend_compressed() -> VortexResult<()> {
    // 512 bits in blocks of 128: true, false, true, false.
    let bits = (0..512)
        .map(|position| (position / 128) % 2 == 0)
        .collect::<Vec<_>>();
    let array = BoolArray::new(BitBuffer::from(bits), Validity::NonNullable);

    let compressor = BtrBlocksCompressor::default();
    let mut ctx = SESSION.create_execution_ctx();
    let compressed = compressor.compress(&array.clone().into_array(), &mut ctx)?;

    assert!(compressed.is::<RunEnd>());
    assert_arrays_eq!(compressed, array);
    Ok(())
}

/// Bool leaves nested two list levels deep are compressed to run-end encoding, and the whole
/// structure still compares equal to the original.
#[test]
fn test_nested_bool_list_runend_compressed() -> VortexResult<()> {
    // 512 leaf bits in blocks of 128: true, false, true, false.
    let leaf_bits = (0..512)
        .map(|position| (position / 128) % 2 == 0)
        .collect::<Vec<_>>();
    let leaf = BoolArray::new(BitBuffer::from(leaf_bits), Validity::NonNullable);

    // Inner lists: four lists of 128 leaves each.
    let inner_offsets = buffer![0u32, 128, 256, 384, 512].into_array();
    let inner = ListArray::try_new(leaf.into_array(), inner_offsets, Validity::NonNullable)?;

    // Outer lists: two lists of two inner lists each.
    let outer_offsets = buffer![0u32, 2, 4].into_array();
    let outer = ListArray::try_new(inner.into_array(), outer_offsets, Validity::NonNullable)?;

    let compressor = BtrBlocksCompressor::default();
    let mut ctx = SESSION.create_execution_ctx();
    let compressed = compressor.compress(&outer.clone().into_array(), &mut ctx)?;

    let outer_compressed = compressed.as_::<List>();
    let inner_compressed = outer_compressed.elements().as_::<List>();
    assert!(inner_compressed.elements().is::<RunEnd>());
    assert_arrays_eq!(compressed, outer);
    Ok(())
}
}
Loading
Loading