Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions vortex-array/src/arrays/listview/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use crate::LEGACY_SESSION;
#[expect(deprecated)]
use crate::ToCanonical as _;
use crate::VortexSessionExecute;
use crate::aggregate_fn::fns::min_max::min_max;
use crate::array::Array;
use crate::array::ArrayParts;
use crate::array::TypedArrayRef;
Expand All @@ -32,11 +33,14 @@ use crate::arrays::Primitive;
use crate::arrays::PrimitiveArray;
use crate::arrays::bool;
use crate::arrays::primitive::PrimitiveArrayExt;
use crate::builtins::ArrayBuiltins;
use crate::dtype::DType;
use crate::dtype::IntegerPType;
use crate::dtype::PType;
use crate::expr::stats::Stat;
use crate::match_each_integer_ptype;
use crate::match_each_unsigned_integer_ptype;
use crate::scalar_fn::fns::operators::Operator;
use crate::validity::Validity;

/// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
Expand Down Expand Up @@ -522,6 +526,56 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {

Ok(estimate)
}

/// Returns the half-open range `[start, end)` of `elements` indices referenced by any view:
/// the minimum offset and the maximum `offset + size`. Elements outside this range are
/// unreferenced leading or trailing slack that a
/// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim.
///
/// For **zero-copy-to-list** arrays this is `O(1)`: views are sorted and non-overlapping with
/// no interior gaps, so the bounds are exactly `[first_offset, last_offset + last_size)`.
/// Otherwise it computes min/max statistics over `offsets` and `offsets + sizes`.
///
/// # Preconditions
///
/// The array must contain at least one list (`len() > 0`).
fn referenced_element_bounds(&self, ctx: &mut ExecutionCtx) -> VortexResult<(usize, usize)> {
let n_lists = self.as_ref().len();
vortex_ensure!(
n_lists > 0,
"referenced_element_bounds requires a non-empty array"
);

if self.is_zero_copy_to_list() {
let start = self.offset_at(0);
let end = self.offset_at(n_lists - 1) + self.size_at(n_lists - 1);
return Ok((start, end));
}

let start = self
.offsets()
.statistics()
.compute_min::<usize>(ctx)
.vortex_expect("offsets must report a usize min statistic");

// Cast offsets and sizes to the widest integer type so that `offset + size` cannot overflow
// the narrower input width.
let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() {
PType::U64
} else {
PType::I64
});
let offsets = self.offsets().cast(wide_dtype.clone())?;
let sizes = self.sizes().cast(wide_dtype)?;
let end = min_max(&offsets.binary(sizes, Operator::Add)?, ctx)?
.vortex_expect("non-empty array must report a min/max")
.max
.as_primitive()
.as_::<usize>()
.vortex_expect("max `offset + size` must fit in a usize");

Ok((start, end))
}
}
impl<T: TypedArrayRef<ListView>> ListViewArrayExt for T {}

Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/arrays/listview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub use conversion::recursive_list_from_list_view;

mod rebuild;
pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD;
pub use rebuild::DEFAULT_TRIM_ELEMENTS_THRESHOLD;
pub use rebuild::ListViewRebuildMode;

#[cfg(test)]
Expand Down
79 changes: 28 additions & 51 deletions vortex-array/src/arrays/listview/rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,13 @@ use crate::LEGACY_SESSION;
#[expect(deprecated)]
use crate::ToCanonical as _;
use crate::VortexSessionExecute;
use crate::aggregate_fn::fns::min_max::min_max;
use crate::arrays::ConstantArray;
use crate::arrays::ListViewArray;
use crate::arrays::PrimitiveArray;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::primitive::PrimitiveArrayExt;
use crate::builders::builder_with_capacity;
use crate::builtins::ArrayBuiltins;
use crate::dtype::DType;
use crate::dtype::IntegerPType;
use crate::dtype::Nullability;
use crate::dtype::PType;
Expand Down Expand Up @@ -54,6 +52,16 @@ fn rebuilt_offset_ptype(offsets_ptype: PType) -> PType {
/// list element dtypes.
pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1;

/// Waste threshold to decide whether to trim a zero-copy-to-list `ListViewArray`.
///
/// A zero-copy-to-list array has no overlaps and no interior gaps, so its only unreferenced bytes
/// are leading and trailing elements. Trimming those is much cheaper than a full rebuild (a lazy
/// `elements` slice plus an `O(num_lists)` offset adjustment, with no element data copy), so we use
/// a more aggressive threshold than [`DEFAULT_REBUILD_DENSITY_THRESHOLD`].
///
/// When the unreferenced (leading + trailing) fraction of `elements` exceeds this threshold, we trim.
pub const DEFAULT_TRIM_ELEMENTS_THRESHOLD: f32 = 0.05;

/// Modes for rebuilding a [`ListViewArray`].
pub enum ListViewRebuildMode {
/// Removes all unused data and flattens out all list data, such that the array is zero-copyable
Expand All @@ -67,6 +75,9 @@ pub enum ListViewRebuildMode {

/// Removes any leading or trailing elements that are unused / not referenced by any views in
/// the [`ListViewArray`].
///
/// If the referenced `[start, end)` bounds are already known, prefer calling
/// [`trim_elements`](ListViewArray::trim_elements) directly to avoid recomputing them.
TrimElements,

/// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
Expand Down Expand Up @@ -335,58 +346,24 @@ impl ListViewArray {

/// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
/// elements, which is defined as a contiguous run of values in the `elements` array that are
/// not referecened by any views in the corresponding [`ListViewArray`].
/// not referenced by any views in the corresponding [`ListViewArray`].
fn rebuild_trim_elements(&self) -> VortexResult<ListViewArray> {
let start = if self.is_zero_copy_to_list() {
// If offsets are sorted, then the minimum offset is the first offset.
// Note that even if the first view is null, offsets must always be valid, so it is
// completely fine for us to use this as a lower-bounded start of the `elements`.
self.offset_at(0)
} else {
let mut ctx = LEGACY_SESSION.create_execution_ctx();
self.offsets()
.statistics()
.compute_min(&mut ctx)
.vortex_expect(
"[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`",
)
};

let end = if self.is_zero_copy_to_list() {
// If offsets are sorted and there are no overlaps (views are always "increasing"), we
// can just grab the last offset and last size.
let last_offset = self.offset_at(self.len() - 1);
let last_size = self.size_at(self.len() - 1);
last_offset + last_size
} else {
// Cast offsets and sizes to the widest integer type to prevent
// overflow when computing offsets + sizes. The end offset may not
// fit in the integer width otherwise.
let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() {
PType::U64
} else {
PType::I64
});
let offsets = self.offsets().cast(wide_dtype.clone())?;
let sizes = self.sizes().cast(wide_dtype)?;

let mut ctx = LEGACY_SESSION.create_execution_ctx();
let min_max = min_max(
&offsets
.binary(sizes, Operator::Add)
.vortex_expect("`offsets + sizes` somehow overflowed"),
&mut ctx,
)
.vortex_expect("Something went wrong while computing min and max")
.vortex_expect("We checked that the array was not empty in the top-level `rebuild`");
let mut ctx = LEGACY_SESSION.create_execution_ctx();
let (start, end) = self.referenced_element_bounds(&mut ctx)?;

min_max
.max
.as_primitive()
.as_::<usize>()
.vortex_expect("unable to interpret the max `offset + size` as a `usize`")
};
// SAFETY: we calculated valid start and end bounds
unsafe { self.trim_elements(start, end) }
}

/// Unsafely trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset
/// down by `start`. The result preserves the original `is_zero_copy_to_list` flag.
///
/// # SAFETY
///
/// `start` must be the minimum value of `offsets`, and end should be the maximum value of `offsets[i] + size[i]`
/// over all indices `i` of offsets. Otherwise, `offsets` and `sizes` may hold references to elements that no longer exist
/// and the array will be corrupted.
pub unsafe fn trim_elements(&self, start: usize, end: usize) -> VortexResult<ListViewArray> {
let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
let offset = <O as FromPrimitive>::from_usize(start)
.vortex_expect("unable to convert the min offset `start` into a `usize`");
Expand Down
40 changes: 40 additions & 0 deletions vortex-array/src/arrays/listview/tests/density.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
//! Tests for `compute_referenced_elements_mask`, `compute_density`, and
//! `estimate_density` on `ListViewArray`.

use vortex_buffer::buffer;
use vortex_error::VortexResult;
use vortex_mask::Mask;
use vortex_session::VortexSession;
Expand All @@ -14,13 +15,16 @@ use super::common::create_large_listview;
use super::common::create_overlapping_listview;
use super::common::create_sparse_overlapping_listview;
use crate::ExecutionCtx;
use crate::IntoArray;
use crate::VortexSessionExecute;
use crate::arrays::ListViewArray;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::tests::common::create_empty_elements_listview;
use crate::expr::stats::Precision;
use crate::expr::stats::Stat;
use crate::scalar::ScalarValue;
use crate::session::ArraySession;
use crate::validity::Validity;

const EPS: f32 = 1e-6;

Expand Down Expand Up @@ -137,3 +141,39 @@ fn referenced_mask_set_bits_match_views() -> VortexResult<()> {
assert!(bb.value(19));
Ok(())
}

#[test]
fn referenced_bounds_zero_copy_full_coverage() -> VortexResult<()> {
let mut ctx = test_execution_ctx();
// create_basic_listview references all 10 elements with no leading/trailing slack.
let lv = create_basic_listview();
assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (0, 10));
Ok(())
}

#[test]
fn referenced_bounds_zero_copy_with_slack() -> VortexResult<()> {
let mut ctx = test_execution_ctx();
// 10 elements, views cover [2, 6): 2 leading + 4 trailing unreferenced.
let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array();
let offsets = buffer![2u32, 4].into_array();
let sizes = buffer![2u32, 2].into_array();
let lv = unsafe {
ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable)
.with_zero_copy_to_list(true)
};
assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (2, 6));
Ok(())
}

#[test]
fn referenced_bounds_non_zero_copy() -> VortexResult<()> {
let mut ctx = test_execution_ctx();
let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array();
let offsets = buffer![5u32, 2].into_array();
let sizes = buffer![2u32, 2].into_array();
let lv = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
assert!(!lv.is_zero_copy_to_list());
assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (2, 7));
Ok(())
}
63 changes: 58 additions & 5 deletions vortex-array/src/arrow/executor/list_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::ExecutionCtx;
use crate::arrays::ListViewArray;
use crate::arrays::PrimitiveArray;
use crate::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD;
use crate::arrays::listview::DEFAULT_TRIM_ELEMENTS_THRESHOLD;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewDataParts;
use crate::arrays::listview::ListViewRebuildMode;
Expand All @@ -31,11 +32,28 @@ pub(super) fn to_arrow_list_view<O: OffsetSizeTrait + IntegerPType>(
) -> VortexResult<arrow_array::ArrayRef> {
let array = array.execute::<ListViewArray>(ctx)?;

// If the array is sufficiently sparse, rebuild before handing it to Arrow. Otherwise downstream
// consumers hold an elements buffer containing unreferenced data in memory indefinitely,
// and any compute pass over that buffer wastes work on data nothing references.
let density = array.upper_bound_density(ctx)?;
let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD {
// Reclaim unreferenced elements before handing the array to Arrow. Otherwise downstream
// consumers hold an elements buffer containing unreferenced data in memory indefinitely, and
// any compute pass over that buffer wastes work on data nothing references.
let array = if array.is_zero_copy_to_list() {
// A zctl array has no overlaps and no interior gaps, so the only unreferenced
// elements are leading and trailing. Trimming them is much cheaper than a full rebuild.
// Compute the referenced bounds once and reuse them for both the decision and the trim.
let n_elts = array.elements().len();
if n_elts == 0 || array.is_empty() {
array
} else {
let (start, end) = array.referenced_element_bounds(ctx)?;
let waste = (n_elts - (end - start)) as f32 / n_elts as f32;
if waste > DEFAULT_TRIM_ELEMENTS_THRESHOLD {
// SAFETY: we calculated valid start and end bounds
unsafe { array.trim_elements(start, end)? }
} else {
array
}
}
} else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD {
// Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste.
array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
} else {
array
Expand Down Expand Up @@ -106,6 +124,41 @@ mod tests {
use crate::arrow::executor::list_view::PrimitiveArray;
use crate::validity::Validity;

#[test]
fn trims_zero_copy_with_significant_trailing_waste() -> VortexResult<()> {
let mut ctx = LEGACY_SESSION.create_execution_ctx();
// Zero-copy-to-list array with 10 elements but only [0, 4) referenced -> 60% waste.
// The conversion should trim the elements buffer down to the referenced range.
let elements = PrimitiveArray::new(
buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9],
Validity::NonNullable,
);
let offsets = PrimitiveArray::new(buffer![0i32, 2], Validity::NonNullable);
let sizes = PrimitiveArray::new(buffer![2i32, 2], Validity::NonNullable);
let list_array = unsafe {
ListViewArray::new_unchecked(
elements.into_array(),
offsets.into_array(),
sizes.into_array(),
Validity::NonNullable,
)
.with_zero_copy_to_list(true)
};

let field = Field::new("item", DataType::Int32, false);
let arrow_dt = DataType::ListView(field.into());
let arrow_array = list_array
.into_array()
.execute_arrow(Some(&arrow_dt), &mut ctx)?;

let listview = arrow_array
.as_any()
.downcast_ref::<GenericListViewArray<i32>>()
.unwrap();
assert_eq!(listview.values().len(), 4);
Ok(())
}

#[test]
fn test_to_arrow_listview_i32() -> VortexResult<()> {
let mut ctx = LEGACY_SESSION.create_execution_ctx();
Expand Down
22 changes: 20 additions & 2 deletions vortex-duckdb/src/exporter/list_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use vortex::array::ExecutionCtx;
use vortex::array::arrays::ListViewArray;
use vortex::array::arrays::PrimitiveArray;
use vortex::array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD;
use vortex::array::arrays::listview::DEFAULT_TRIM_ELEMENTS_THRESHOLD;
use vortex::array::arrays::listview::ListViewArrayExt;
use vortex::array::arrays::listview::ListViewDataParts;
use vortex::array::arrays::listview::ListViewRebuildMode;
Expand Down Expand Up @@ -55,8 +56,25 @@ pub(crate) fn new_exporter(
// If the array is sufficiently sparse, rebuild. Otherwise the DuckDB vector will
// hold an elements buffer containing unreferenced data in memory indefinitely,
// and any compute pass over that buffer wastes work on data nothing references.
let density = array.upper_bound_density(ctx)?;
let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD {
let array = if array.is_zero_copy_to_list() {
// A zctl array has no overlaps and no interior gaps, so the only unreferenced
// elements are leading and trailing. Trimming them is much cheaper than a full rebuild.
// Compute the referenced bounds once and reuse them for both the decision and the trim.
let n_elts = array.elements().len();
if n_elts == 0 || array.is_empty() {
array
} else {
let (start, end) = array.referenced_element_bounds(ctx)?;
let waste = (n_elts - (end - start)) as f32 / n_elts as f32;
if waste > DEFAULT_TRIM_ELEMENTS_THRESHOLD {
// SAFETY: we calculated valid start and end bounds
unsafe { array.trim_elements(start, end)? }
} else {
array
}
}
} else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD {
// Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste.
array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
} else {
array
Expand Down
Loading