diff --git a/vortex-buffer/benches/vortex_bitbuffer.rs b/vortex-buffer/benches/vortex_bitbuffer.rs index 67ce88da889..7efc6c6a114 100644 --- a/vortex-buffer/benches/vortex_bitbuffer.rs +++ b/vortex-buffer/benches/vortex_bitbuffer.rs @@ -210,6 +210,17 @@ fn bitwise_and_arrow_buffer(bencher: Bencher, length: usize) { .bench_refs(|(a, b)| &a.0 & &b.0); } +/// Owned-LHS AND: the left operand is a fresh, uniquely-owned `BitBuffer` each iteration, so +/// `bitwise_binary_op_lhs_owned` takes the in-place (zero-allocation) fast path. Compare against +/// `bitwise_and_vortex_buffer` (reference-LHS, which always allocates a result buffer). +#[divan::bench(args = INPUT_SIZE)] +fn bitand_owned_lhs_vortex_buffer(bencher: Bencher, length: usize) { + let b = BitBuffer::from_iter((0..length).map(|i| i % 3 == 0)); + bencher + .with_inputs(|| BitBuffer::from_iter((0..length).map(|i| i % 2 == 0))) + .bench_values(|a| a & &b); +} + #[divan::bench(args = INPUT_SIZE)] fn bitwise_or_vortex_buffer(bencher: Bencher, length: usize) { let a = BitBuffer::from_iter((0..length).map(|i| i % 2 == 0)); diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs index 83b28e24b4a..457827f4ab9 100644 --- a/vortex-buffer/src/bit/buf.rs +++ b/vortex-buffer/src/bit/buf.rs @@ -25,7 +25,9 @@ use crate::bit::collect_bool_word; use crate::bit::count_ones::count_ones; use crate::bit::get_bit_unchecked; use crate::bit::ops::bitwise_binary_op; +use crate::bit::ops::bitwise_binary_op_lhs_owned; use crate::bit::ops::bitwise_unary_op; +use crate::bit::ops::bitwise_unary_op_copy; use crate::bit::select::bit_select; use crate::buffer; @@ -59,6 +61,29 @@ impl PartialEq for BitBuffer { return false; } + if self.len == 0 { + return true; + } + + // Fast path: both byte-aligned and same length — direct byte comparison. + if self.offset == 0 && other.offset == 0 { + let full_bytes = self.len / 8; + let self_bytes = &self.buffer.as_slice()[..full_bytes]; + let other_bytes = &other.buffer.as_slice()[..full_bytes]; + if self_bytes != other_bytes { + return false; + } + // Compare remaining bits in the last partial byte. + let rem = self.len % 8; + if rem != 0 { + let mask = (1u8 << rem) - 1; + let a = self.buffer.as_slice()[full_bytes] & mask; + let b = other.buffer.as_slice()[full_bytes] & mask; + return a == b; + } + return true; + } + self.chunks() .iter_padded() .zip(other.chunks().iter_padded()) @@ -315,6 +340,7 @@ impl BitBuffer { } /// Get the number of set bits in the buffer. + #[inline] pub fn true_count(&self) -> usize { count_ones(self.buffer.as_slice(), self.offset, self.len) } @@ -330,6 +356,7 @@ impl BitBuffer { } /// Get the number of unset bits in the buffer. + #[inline] pub fn false_count(&self) -> usize { self.len - self.true_count() } @@ -353,12 +380,14 @@ impl BitBuffer { pub fn sliced(&self) -> Self { if self.offset.is_multiple_of(8) { return Self::new( - self.buffer.slice(self.offset / 8..self.len.div_ceil(8)), + self.buffer + .slice(self.offset / 8..(self.offset + self.len).div_ceil(8)), self.len, ); } - bitwise_unary_op(self.clone(), |a| a) + // Allocate directly rather than clone + identity op which would fail try_into_mut. + bitwise_unary_op_copy(self, |a| a) } } @@ -402,7 +431,7 @@ impl BitOr for BitBuffer { #[inline] fn bitor(self, rhs: Self) -> Self::Output { - BitOr::bitor(&self, &rhs) + bitwise_binary_op_lhs_owned(self, &rhs, |a, b| a | b) } } @@ -420,7 +449,7 @@ impl BitOr<&BitBuffer> for BitBuffer { #[inline] fn bitor(self, rhs: &BitBuffer) -> Self::Output { - (&self).bitor(rhs) + bitwise_binary_op_lhs_owned(self, rhs, |a, b| a | b) } } @@ -447,7 +476,7 @@ impl BitAnd<&BitBuffer> for BitBuffer { #[inline] fn bitand(self, rhs: &BitBuffer) -> Self::Output { - (&self).bitand(rhs) + bitwise_binary_op_lhs_owned(self, rhs, |a, b| a & b) } } @@ -456,7 +485,7 @@ impl BitAnd for BitBuffer { #[inline] fn bitand(self, rhs: BitBuffer) -> Self::Output { - (&self).bitand(&rhs) + bitwise_binary_op_lhs_owned(self, &rhs, |a, b| a & b) } } @@ -465,7 +494,9 @@ impl Not for &BitBuffer { #[inline] fn not(self) -> Self::Output { - !self.clone() + // Allocate directly rather than clone+try_into_mut, which always fails + // since the clone shares the Arc with the original reference. + bitwise_unary_op_copy(self, |a| !a) } } @@ -492,7 +523,7 @@ impl BitXor<&BitBuffer> for BitBuffer { #[inline] fn bitxor(self, rhs: &BitBuffer) -> Self::Output { - (&self).bitxor(rhs) + bitwise_binary_op_lhs_owned(self, rhs, |a, b| a ^ b) } } @@ -505,6 +536,11 @@ impl BitBuffer { bitwise_binary_op(self, rhs, |a, b| a & !b) } + /// Owned variant of [`bitand_not`](Self::bitand_not) that can mutate in-place when possible. + pub fn into_bitand_not(self, rhs: &BitBuffer) -> BitBuffer { + bitwise_binary_op_lhs_owned(self, rhs, |a, b| a & !b) + } + /// Iterate through bits in a buffer. /// /// # Arguments @@ -524,44 +560,23 @@ impl BitBuffer { return; } - let is_bit_set = |byte: u8, bit_idx: usize| (byte & (1 << bit_idx)) != 0; - let bit_offset = self.offset % 8; - let mut buffer_ptr = unsafe { self.buffer.as_ptr().add(self.offset / 8) }; - let mut callback_idx = 0; - - // Handle incomplete first byte. - if bit_offset > 0 { - let bits_in_first_byte = (8 - bit_offset).min(total_bits); - let byte = unsafe { *buffer_ptr }; - - for bit_idx in 0..bits_in_first_byte { - f(callback_idx, is_bit_set(byte, bit_offset + bit_idx)); - callback_idx += 1; - } - - buffer_ptr = unsafe { buffer_ptr.add(1) }; - } - - // Process complete bytes. - let complete_bytes = (total_bits - callback_idx) / 8; - for _ in 0..complete_bytes { - let byte = unsafe { *buffer_ptr }; + // Process in 64-bit chunks for better ILP and fewer loop iterations. + let chunks = self.chunks(); + let chunks_count = total_bits / 64; + let remainder = total_bits % 64; - for bit_idx in 0..8 { - f(callback_idx, is_bit_set(byte, bit_idx)); - callback_idx += 1; + for (chunk_idx, chunk) in chunks.iter().enumerate() { + let base = chunk_idx * 64; + for bit_idx in 0..64 { + f(base + bit_idx, (chunk >> bit_idx) & 1 == 1); } - buffer_ptr = unsafe { buffer_ptr.add(1) }; } - // Handle remaining bits at the end. - let remaining_bits = total_bits - callback_idx; - if remaining_bits > 0 { - let byte = unsafe { *buffer_ptr }; - - for bit_idx in 0..remaining_bits { - f(callback_idx, is_bit_set(byte, bit_idx)); - callback_idx += 1; + if remainder != 0 { + let rem_chunk = chunks.remainder_bits(); + let base = chunks_count * 64; + for bit_idx in 0..remainder { + f(base + bit_idx, (rem_chunk >> bit_idx) & 1 == 1); } } } diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index a366eaae9a8..684295f60d9 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -275,6 +275,7 @@ impl BitBufferMut { /// Set the bit at `index` to the given boolean value. /// /// This operation is checked so if `index` exceeds the buffer length, this will panic. + #[inline] pub fn set_to(&mut self, index: usize, value: bool) { if value { self.set(index); @@ -288,6 +289,7 @@ impl BitBufferMut { /// # Safety /// /// The caller must ensure that `index` does not exceed the largest bit index in the backing buffer. + #[inline] pub unsafe fn set_to_unchecked(&mut self, index: usize, value: bool) { if value { // SAFETY: checked by caller @@ -301,6 +303,7 @@ impl BitBufferMut { /// Set a position to `true`. /// /// This operation is checked so if `index` exceeds the buffer length, this will panic. + #[inline] pub fn set(&mut self, index: usize) { assert!(index < self.len, "index {index} exceeds len {}", self.len); @@ -373,12 +376,21 @@ impl BitBufferMut { return; } - let new_len_bytes = (self.offset + len).div_ceil(8); + let end_bit = self.offset + len; + let new_len_bytes = end_bit.div_ceil(8); self.buffer.truncate(new_len_bytes); self.len = len; + + // Clear stale bits in the final partial byte so the "bits beyond len are zero" invariant + // holds. `append_false` (and `append_buffer`) rely on it to avoid a read-modify-write. + if !end_bit.is_multiple_of(8) { + let keep = (1u8 << (end_bit % 8)) - 1; + self.buffer.as_mut_slice()[new_len_bytes - 1] &= keep; + } } /// Append a new boolean into the bit buffer, incrementing the length. + #[inline] pub fn append(&mut self, value: bool) { if value { self.append_true() @@ -388,6 +400,7 @@ impl BitBufferMut { } /// Append a new true value to the buffer. + #[inline] pub fn append_true(&mut self) { let bit_pos = self.offset + self.len; let byte_pos = bit_pos / 8; @@ -404,21 +417,18 @@ impl BitBufferMut { } /// Append a new false value to the buffer. + #[inline] pub fn append_false(&mut self) { let bit_pos = self.offset + self.len; let byte_pos = bit_pos / 8; - let bit_in_byte = bit_pos % 8; - // Ensure buffer has enough bytes + // Ensure buffer has enough bytes (pushed as 0x00, so bit is already unset). if byte_pos >= self.buffer.len() { self.buffer.push(0u8); } - // Bit is already 0 if we just pushed a new byte, otherwise ensure it's unset - if bit_in_byte != 0 { - self.buffer.as_mut_slice()[byte_pos] &= !(1 << bit_in_byte); - } - + // The bit is guaranteed to be 0: new bytes are zero-initialized, and + // existing bytes have this bit unset (it's beyond the current length). self.len += 1; } @@ -487,24 +497,39 @@ impl BitBufferMut { let end_bit_pos = start_bit_pos + bit_len; let required_bytes = end_bit_pos.div_ceil(8); - // Ensure buffer has enough bytes + // Ensure buffer has enough bytes, zero-initialized for OR-based writes. if required_bytes > self.buffer.len() { self.buffer.push_n(0x00, required_bytes - self.buffer.len()); } - // Use bitvec for efficient bit copying - let self_slice = self - .buffer - .as_mut_slice() - .view_bits_mut::(); - let other_slice = buffer - .inner() - .as_slice() - .view_bits::(); - - // Copy from source buffer (accounting for its offset) to destination (accounting for our offset + len) - let source_range = buffer.offset()..buffer.offset() + bit_len; - self_slice[start_bit_pos..end_bit_pos].copy_from_bitslice(&other_slice[source_range]); + let dst_bit_offset = start_bit_pos % 8; + let src_bit_offset = buffer.offset(); + + if dst_bit_offset == 0 && src_bit_offset == 0 { + // Both byte-aligned: use memcpy for full bytes, then mask the tail. + let dst_byte = start_bit_pos / 8; + let src_bytes = buffer.inner().as_slice(); + let full_bytes = bit_len / 8; + self.buffer.as_mut_slice()[dst_byte..dst_byte + full_bytes] + .copy_from_slice(&src_bytes[..full_bytes]); + let rem = bit_len % 8; + if rem != 0 { + let mask = (1u8 << rem) - 1; + self.buffer.as_mut_slice()[dst_byte + full_bytes] |= src_bytes[full_bytes] & mask; + } + } else { + // Use bitvec for unaligned bit copying. + let self_slice = self + .buffer + .as_mut_slice() + .view_bits_mut::(); + let other_slice = buffer + .inner() + .as_slice() + .view_bits::(); + let source_range = src_bit_offset..src_bit_offset + bit_len; + self_slice[start_bit_pos..end_bit_pos].copy_from_bitslice(&other_slice[source_range]); + } self.len += bit_len; } @@ -611,7 +636,8 @@ impl FromIterator for BitBufferMut { } } - // Append the remaining items (as we do not know how many more there are). + // Append any remaining items one at a time, as we do not know how many more there are. + // (`append` is already a single branch + bit set, see `append_true`/`append_false`.) for v in iter { buf.append(v); } @@ -659,6 +685,24 @@ mod tests { assert!(bools.value(9)); } + #[test] + fn append_false_after_truncate_reads_back_false() { + // `truncate` leaves stale bits in the final partial byte; a subsequent `append_false` + // must still read back as false. Regression test for the `append_false` fast path. + let mut bools = BitBufferMut::new_set(16); + bools.truncate(12); + bools.append_false(); + bools.append_true(); + + let bools = bools.freeze(); + assert_eq!(bools.len(), 14); + assert!( + !bools.value(12), + "appended false must read back false after truncate" + ); + assert!(bools.value(13)); + } + #[test] fn test_reserve_ensures_len_plus_additional() { // This test documents the fix for the bug where reserve was incorrectly diff --git a/vortex-buffer/src/bit/ops.rs b/vortex-buffer/src/bit/ops.rs index 9406c728f06..9b1a2bb4c15 100644 --- a/vortex-buffer/src/bit/ops.rs +++ b/vortex-buffer/src/bit/ops.rs @@ -4,86 +4,106 @@ use crate::BitBuffer; use crate::BitBufferMut; use crate::Buffer; -use crate::ByteBufferMut; use crate::trusted_len::TrustedLenExt; +/// Read up to 8 bytes as a little-endian `u64`, zero-padding the high bytes when fewer than 8 are +/// supplied. Using [`u64::from_le_bytes`] keeps the bit-numbering identical on little- and +/// big-endian targets; for a full 8-byte slice it lowers to a single word load. #[inline] -pub(super) fn bitwise_unary_op u64>(buffer: BitBuffer, mut op: F) -> BitBuffer { +fn read_u64_le(bytes: &[u8]) -> u64 { + debug_assert!(bytes.len() <= 8); + let mut buf = [0u8; 8]; + buf[..bytes.len()].copy_from_slice(bytes); + u64::from_le_bytes(buf) +} + +/// Apply `op` to each little-endian `u64` word of `data` in place. +/// +/// `data` is split into full `u64` words via [`slice::as_chunks`], with the trailing +/// `data.len() % 8` bytes handled as one final partial word (see [`read_u64_le`]). +#[inline] +fn map_u64_words_in_place u64>(data: &mut [u8], mut op: F) { + let (words, tail) = data.as_chunks_mut::<8>(); + for word in words { + *word = op(u64::from_le_bytes(*word)).to_le_bytes(); + } + if !tail.is_empty() { + let word = op(read_u64_le(tail)).to_le_bytes(); + tail.copy_from_slice(&word[..tail.len()]); + } +} + +/// Combine each little-endian `u64` word of `dst` with the matching word of `src` via `op`, +/// writing the result back into `dst`. Processes `dst.len().min(src.len())` bytes; see +/// [`map_u64_words_in_place`] for the partial-word handling. +#[inline] +fn zip_u64_words_in_place u64>(dst: &mut [u8], src: &[u8], mut op: F) { + let n = dst.len().min(src.len()); + let (dst_words, dst_tail) = dst[..n].as_chunks_mut::<8>(); + let (src_words, src_tail) = src[..n].as_chunks::<8>(); + for (d, s) in dst_words.iter_mut().zip(src_words) { + *d = op(u64::from_le_bytes(*d), u64::from_le_bytes(*s)).to_le_bytes(); + } + // Both slices were truncated to `n`, so their tails are the same length. + if !dst_tail.is_empty() { + let word = op(read_u64_le(dst_tail), read_u64_le(src_tail)).to_le_bytes(); + dst_tail.copy_from_slice(&word[..dst_tail.len()]); + } +} + +/// Apply a unary operation to a [`BitBuffer`], always allocating a new output buffer. +#[inline] +pub(super) fn bitwise_unary_op_copy u64>(buffer: &BitBuffer, op: F) -> BitBuffer { + let mut buf = BitBufferMut::copy_from(buffer); + map_u64_words_in_place(buf.as_mut_slice(), op); + buf.freeze() +} + +/// Apply a unary operation to an owned [`BitBuffer`], mutating in-place when possible. +/// +/// Tries to get exclusive access via `try_into_mut`. If the backing storage is shared +/// (Arc refcount > 1), falls back to [`bitwise_unary_op_copy`]. +#[inline] +pub(super) fn bitwise_unary_op u64>(buffer: BitBuffer, op: F) -> BitBuffer { match buffer.try_into_mut() { Ok(mut buf) => { bitwise_unary_op_mut(&mut buf, op); buf.freeze() } - Err(buffer) => { - let len = buffer.len(); - let offset = buffer.offset(); - let src = buffer.inner().as_slice(); - - let mut dst = ByteBufferMut::with_capacity(src.len()); - let u64_len = src.len() / 8; - let remainder = src.len() % 8; - - let mut src_ptr = src.as_ptr() as *const u64; - let mut dst_ptr = dst.spare_capacity_mut().as_mut_ptr() as *mut u64; - for _ in 0..u64_len { - let value = unsafe { src_ptr.read_unaligned() }; - unsafe { dst_ptr.write_unaligned(op(value)) }; - src_ptr = unsafe { src_ptr.add(1) }; - dst_ptr = unsafe { dst_ptr.add(1) }; - } - - if remainder > 0 { - let mut remainder_u64 = 0u64; - let src_bytes = src_ptr as *const u8; - let dst_bytes = dst_ptr as *mut u8; - for i in 0..remainder { - let byte = unsafe { src_bytes.add(i).read() }; - remainder_u64 |= (byte as u64) << (i * 8); - } - let remainder_u64 = op(remainder_u64); - for i in 0..remainder { - let byte = ((remainder_u64 >> (i * 8)) & 0xFF) as u8; - unsafe { dst_bytes.add(i).write(byte) }; - } - } - - // SAFETY: we wrote exactly src.len() bytes into the spare capacity. - unsafe { dst.set_len(src.len()) }; - BitBuffer::new_with_offset(dst.freeze(), len, offset) - } + Err(buffer) => bitwise_unary_op_copy(&buffer, op), } } #[inline] -pub(super) fn bitwise_unary_op_mut u64>(buffer: &mut BitBufferMut, mut op: F) { - let slice_mut = buffer.as_mut_slice(); - - // The number of complete u64 words in the buffer (unaligned) - let u64_len = slice_mut.len() / 8; - let remainder = slice_mut.len() % 8; - - // Create a pointer to the *unaligned* u64 words - let mut ptr = slice_mut.as_mut_ptr() as *mut u64; - for _ in 0..u64_len { - let value = unsafe { ptr.read_unaligned() }; - let value = op(value); - unsafe { ptr.write_unaligned(value) }; - ptr = unsafe { ptr.add(1) }; - } +pub(super) fn bitwise_unary_op_mut u64>(buffer: &mut BitBufferMut, op: F) { + map_u64_words_in_place(buffer.as_mut_slice(), op); +} + +/// Apply a binary operation with an owned left operand, mutating in-place when possible. +/// +/// Tries `try_into_mut` on the left operand. If the backing storage has exclusive access, +/// the operation is performed in-place (zero allocation). Otherwise, falls back to +/// [`bitwise_binary_op`] which allocates a new buffer. +pub(super) fn bitwise_binary_op_lhs_owned u64>( + left: BitBuffer, + right: &BitBuffer, + op: F, +) -> BitBuffer { + assert_eq!(left.len(), right.len()); - // Read remainder into a u64; - let mut remainder_u64 = 0u64; - let ptr = ptr as *mut u8; - for i in 0..remainder { - let byte = unsafe { ptr.add(i).read() }; - remainder_u64 |= (byte as u64) << (i * 8); + // The in-place path combines the operands word-for-word, which only lines up the logical bits + // when both share the same bit-to-byte alignment. When the offsets differ, fall back to the + // offset-aware allocating path (`bitwise_binary_op`) rather than corrupting the result. + if left.offset() != right.offset() { + return bitwise_binary_op(&left, right, op); } - let remainder_u64 = op(remainder_u64); - // Write back remainder - for i in 0..remainder { - let byte = ((remainder_u64 >> (i * 8)) & 0xFF) as u8; - unsafe { ptr.add(i).write(byte) }; + match left.try_into_mut() { + Ok(mut buf) => { + zip_u64_words_in_place(buf.as_mut_slice(), right.inner().as_slice(), op); + buf.freeze() + } + Err(left) => bitwise_binary_op(&left, right, op), } } @@ -129,7 +149,10 @@ pub(super) fn bitwise_binary_op u64>( mod tests { use std::ops::Not; + use rstest::rstest; + use super::*; + use crate::ByteBufferMut; use crate::bitbuffer; use crate::buffer; @@ -140,6 +163,58 @@ mod tests { assert_eq!(result, bitbuffer![true, false, true, false]); } + #[test] + fn test_lhs_owned_offset_mismatch_regression() { + use crate::buffer_mut; + + // `left` has bit offset 3 and uniquely-owned backing storage, so the in-place fast + // path is taken. Byte 0b1111_1000 → logical bits [3..8) = [1,1,1,1,1]. + let left = BitBufferMut::from_buffer(buffer_mut![0b1111_1000u8], 3, 5).freeze(); + // `right` has bit offset 0. Byte 0b0001_1111 → logical bits [0..5) = [1,1,1,1,1]. + let right = BitBuffer::new(buffer![0b0001_1111u8], 5); + + // AND of two all-true ranges must be all-true. The naive byte-wise in-place path + // ignores the differing offsets and yields the wrong answer. + let got = bitwise_binary_op_lhs_owned(left, &right, |a, b| a & b); + assert_eq!(got.true_count(), 5); + assert_eq!(got, bitbuffer![true, true, true, true, true]); + } + + /// The owned-LHS path (in-place when uniquely owned and the offsets match) must produce the + /// same logical result as the always-correct allocating [`bitwise_binary_op`], for every + /// combination of operand offsets and lengths. + #[rstest] + #[case::aligned(0, 0)] + #[case::equal_nonzero(3, 3)] + #[case::equal_seven(7, 7)] + #[case::mismatch_lo(0, 3)] + #[case::mismatch_hi(5, 2)] + fn lhs_owned_matches_reference(#[case] left_offset: usize, #[case] right_offset: usize) { + // Deterministic byte pattern, so the owned and borrowed inputs are bit-identical. + #[allow(clippy::cast_possible_truncation)] + let make = |offset: usize, len: usize, salt: u8| -> BitBuffer { + let bytes: ByteBufferMut = (0..(offset + len).div_ceil(8).max(1)) + .map(|i| (i as u8).wrapping_mul(31).wrapping_add(salt)) + .collect(); + BitBufferMut::from_buffer(bytes, offset, len).freeze() + }; + let ops: [fn(u64, u64) -> u64; 4] = + [|a, b| a & b, |a, b| a | b, |a, b| a ^ b, |a, b| a & !b]; + + for len in [1usize, 5, 8, 63, 64, 65, 129, 200] { + let right = make(right_offset, len, 0x5A); + for op in ops { + // A fresh, uniquely-owned LHS triggers the in-place path when offsets match. + let got = bitwise_binary_op_lhs_owned(make(left_offset, len, 0xC3), &right, op); + let expected = bitwise_binary_op(&make(left_offset, len, 0xC3), &right, op); + assert_eq!( + got, expected, + "loff={left_offset} roff={right_offset} len={len}" + ); + } + } + } + #[test] fn test_bitwise_binary_and() { // 0b1111 (15) & 0b1010 (10) = 0b1010 (10) diff --git a/vortex-cuda/src/layout.rs b/vortex-cuda/src/layout.rs index 76e5a5ba5fc..26b9d9a8d06 100644 --- a/vortex-cuda/src/layout.rs +++ b/vortex-cuda/src/layout.rs @@ -325,7 +325,8 @@ impl LayoutReader for CudaFlatReader { array = array.slice(row_range.clone())?; } - let array_mask = if mask.density() < EXPR_EVAL_THRESHOLD { + let mask_density = mask.density(); + let array_mask = if mask_density < EXPR_EVAL_THRESHOLD { let array = array.apply(&expr)?; let array = array.filter(mask.clone())?; let mut ctx = session.create_execution_ctx(); @@ -342,7 +343,7 @@ impl LayoutReader for CudaFlatReader { "CudaFlat mask evaluation {} - {} (mask = {}) => {}", name, expr, - mask.density(), + mask_density, array_mask.density(), ); diff --git a/vortex-layout/src/layouts/flat/reader.rs b/vortex-layout/src/layouts/flat/reader.rs index 8817fe5c678..f5e6b5ca405 100644 --- a/vortex-layout/src/layouts/flat/reader.rs +++ b/vortex-layout/src/layouts/flat/reader.rs @@ -148,7 +148,8 @@ impl LayoutReader for FlatReader { array = array.slice(row_range.clone())?; } - let array_mask = if mask.density() < EXPR_EVAL_THRESHOLD { + let mask_density = mask.density(); + let array_mask = if mask_density < EXPR_EVAL_THRESHOLD { // We have the choice to apply the filter or the expression first, we apply the // expression first so that it can try pushing down itself and then the filter // after this. @@ -171,7 +172,7 @@ impl LayoutReader for FlatReader { "Flat mask evaluation {} - {} (mask = {}) => {}", name, expr, - mask.density(), + mask_density, array_mask.density(), ); diff --git a/vortex-layout/src/layouts/zoned/reader.rs b/vortex-layout/src/layouts/zoned/reader.rs index a73ed6a547c..e31c687d049 100644 --- a/vortex-layout/src/layouts/zoned/reader.rs +++ b/vortex-layout/src/layouts/zoned/reader.rs @@ -172,6 +172,7 @@ impl LayoutReader for ZonedReader { assert_eq!(stats_mask.len(), mask.len(), "Mask length mismatch"); // Intersect the masks. + let mask_density = mask.density(); let mut stats_mask = mask.bitand(&stats_mask); // Forward to data child for further pruning. @@ -184,7 +185,7 @@ impl LayoutReader for ZonedReader { "Stats evaluation approx {} - {} (mask = {}) => {}", name, expr, - mask.density(), + mask_density, stats_mask.density(), ); diff --git a/vortex-mask/src/bitops.rs b/vortex-mask/src/bitops.rs index 94d7ef02980..4bebcf28c40 100644 --- a/vortex-mask/src/bitops.rs +++ b/vortex-mask/src/bitops.rs @@ -28,6 +28,26 @@ impl BitAnd for &Mask { } } +impl BitAnd<&Mask> for Mask { + type Output = Mask; + + /// Owned-left AND: can reuse the left buffer in-place when possible. + fn bitand(self, rhs: &Mask) -> Self::Output { + if self.len() != rhs.len() { + vortex_panic!("Masks must have the same length"); + } + + match (self.bit_buffer(), rhs.bit_buffer()) { + (AllOr::All, _) => rhs.clone(), + (AllOr::None, _) | (_, AllOr::None) => Mask::new_false(self.len()), + (_, AllOr::All) => self, + (AllOr::Some(_), AllOr::Some(rhs_buf)) => { + Mask::from_buffer(self.into_bit_buffer() & rhs_buf) + } + } + } +} + impl BitOr for &Mask { type Output = Mask; @@ -46,6 +66,26 @@ impl BitOr for &Mask { } } +impl BitOr<&Mask> for Mask { + type Output = Mask; + + /// Owned-left OR: can reuse the left buffer in-place when possible. + fn bitor(self, rhs: &Mask) -> Self::Output { + if self.len() != rhs.len() { + vortex_panic!("Masks must have the same length"); + } + + match (self.bit_buffer(), rhs.bit_buffer()) { + (AllOr::All, _) | (_, AllOr::All) => Mask::new_true(self.len()), + (AllOr::None, _) => rhs.clone(), + (_, AllOr::None) => self, + (AllOr::Some(_), AllOr::Some(rhs_buf)) => { + Mask::from_buffer(self.into_bit_buffer() | rhs_buf) + } + } + } +} + impl Mask { /// Computes `self & !rhs` (AND NOT), equivalent to set difference. pub fn bitand_not(self, rhs: &Mask) -> Mask { @@ -56,7 +96,9 @@ impl Mask { (AllOr::None, _) | (_, AllOr::All) => Mask::new_false(self.len()), (_, AllOr::None) => self, (AllOr::All, _) => !rhs, - (AllOr::Some(lhs), AllOr::Some(rhs)) => Mask::from_buffer(lhs.bitand_not(rhs)), + (AllOr::Some(_), AllOr::Some(rhs_buf)) => { + Mask::from_buffer(self.into_bit_buffer().into_bitand_not(rhs_buf)) + } } } }