diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 64e59d1687f..e5b6f9aaf99 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -22,6 +22,7 @@ use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; use crate::VortexSessionExecute; +use crate::aggregate_fn::fns::min_max::min_max; use crate::array::Array; use crate::array::ArrayParts; use crate::array::TypedArrayRef; @@ -32,11 +33,14 @@ use crate::arrays::Primitive; use crate::arrays::PrimitiveArray; use crate::arrays::bool; use crate::arrays::primitive::PrimitiveArrayExt; +use crate::builtins::ArrayBuiltins; use crate::dtype::DType; use crate::dtype::IntegerPType; +use crate::dtype::PType; use crate::expr::stats::Stat; use crate::match_each_integer_ptype; use crate::match_each_unsigned_integer_ptype; +use crate::scalar_fn::fns::operators::Operator; use crate::validity::Validity; /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and @@ -522,6 +526,56 @@ pub trait ListViewArrayExt: TypedArrayRef { Ok(estimate) } + + /// Returns the half-open range `[start, end)` of `elements` indices referenced by any view: + /// the minimum offset and the maximum `offset + size`. Elements outside this range are + /// unreferenced leading or trailing slack that a + /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim. + /// + /// For **zero-copy-to-list** arrays this is `O(1)`: views are sorted and non-overlapping with + /// no interior gaps, so the bounds are exactly `[first_offset, last_offset + last_size)`. + /// Otherwise it computes min/max statistics over `offsets` and `offsets + sizes`. + /// + /// # Preconditions + /// + /// The array must contain at least one list (`len() > 0`). + fn referenced_element_bounds(&self, ctx: &mut ExecutionCtx) -> VortexResult<(usize, usize)> { + let n_lists = self.as_ref().len(); + vortex_ensure!( + n_lists > 0, + "referenced_element_bounds requires a non-empty array" + ); + + if self.is_zero_copy_to_list() { + let start = self.offset_at(0); + let end = self.offset_at(n_lists - 1) + self.size_at(n_lists - 1); + return Ok((start, end)); + } + + let start = self + .offsets() + .statistics() + .compute_min::(ctx) + .vortex_expect("offsets must report a usize min statistic"); + + // Cast offsets and sizes to the widest integer type so that `offset + size` cannot overflow + // the narrower input width. + let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() { + PType::U64 + } else { + PType::I64 + }); + let offsets = self.offsets().cast(wide_dtype.clone())?; + let sizes = self.sizes().cast(wide_dtype)?; + let end = min_max(&offsets.binary(sizes, Operator::Add)?, ctx)? + .vortex_expect("non-empty array must report a min/max") + .max + .as_primitive() + .as_::() + .vortex_expect("max `offset + size` must fit in a usize"); + + Ok((start, end)) + } } impl> ListViewArrayExt for T {} diff --git a/vortex-array/src/arrays/listview/mod.rs b/vortex-array/src/arrays/listview/mod.rs index 29dc50f9b9f..d5674372b2c 100644 --- a/vortex-array/src/arrays/listview/mod.rs +++ b/vortex-array/src/arrays/listview/mod.rs @@ -19,6 +19,7 @@ pub use conversion::recursive_list_from_list_view; mod rebuild; pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD; +pub use rebuild::DEFAULT_TRIM_ELEMENTS_THRESHOLD; pub use rebuild::ListViewRebuildMode; #[cfg(test)] diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 21dbcd70cee..2bcb4ef0c1f 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -11,7 +11,6 @@ use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; use crate::VortexSessionExecute; -use crate::aggregate_fn::fns::min_max::min_max; use crate::arrays::ConstantArray; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; @@ -19,7 +18,6 @@ use crate::arrays::listview::ListViewArrayExt; use crate::arrays::primitive::PrimitiveArrayExt; use crate::builders::builder_with_capacity; use crate::builtins::ArrayBuiltins; -use crate::dtype::DType; use crate::dtype::IntegerPType; use crate::dtype::Nullability; use crate::dtype::PType; @@ -54,6 +52,16 @@ fn rebuilt_offset_ptype(offsets_ptype: PType) -> PType { /// list element dtypes. pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1; +/// Waste threshold to decide whether to trim a zero-copy-to-list `ListViewArray`. +/// +/// A zero-copy-to-list array has no overlaps and no interior gaps, so its only unreferenced bytes +/// are leading and trailing elements. Trimming those is much cheaper than a full rebuild (a lazy +/// `elements` slice plus an `O(num_lists)` offset adjustment, with no element data copy), so we use +/// a more aggressive threshold than [`DEFAULT_REBUILD_DENSITY_THRESHOLD`]. +/// +/// When the unreferenced (leading + trailing) fraction of `elements` exceeds this threshold, we trim. +pub const DEFAULT_TRIM_ELEMENTS_THRESHOLD: f32 = 0.05; + /// Modes for rebuilding a [`ListViewArray`]. pub enum ListViewRebuildMode { /// Removes all unused data and flattens out all list data, such that the array is zero-copyable @@ -67,6 +75,9 @@ pub enum ListViewRebuildMode { /// Removes any leading or trailing elements that are unused / not referenced by any views in /// the [`ListViewArray`]. + /// + /// If the referenced `[start, end)` bounds are already known, prefer calling + /// [`trim_elements`](ListViewArray::trim_elements) directly to avoid recomputing them. TrimElements, /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`. @@ -335,58 +346,24 @@ impl ListViewArray { /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing /// elements, which is defined as a contiguous run of values in the `elements` array that are - /// not referecened by any views in the corresponding [`ListViewArray`]. + /// not referenced by any views in the corresponding [`ListViewArray`]. fn rebuild_trim_elements(&self) -> VortexResult { - let start = if self.is_zero_copy_to_list() { - // If offsets are sorted, then the minimum offset is the first offset. - // Note that even if the first view is null, offsets must always be valid, so it is - // completely fine for us to use this as a lower-bounded start of the `elements`. - self.offset_at(0) - } else { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - self.offsets() - .statistics() - .compute_min(&mut ctx) - .vortex_expect( - "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`", - ) - }; - - let end = if self.is_zero_copy_to_list() { - // If offsets are sorted and there are no overlaps (views are always "increasing"), we - // can just grab the last offset and last size. - let last_offset = self.offset_at(self.len() - 1); - let last_size = self.size_at(self.len() - 1); - last_offset + last_size - } else { - // Cast offsets and sizes to the widest integer type to prevent - // overflow when computing offsets + sizes. The end offset may not - // fit in the integer width otherwise. - let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() { - PType::U64 - } else { - PType::I64 - }); - let offsets = self.offsets().cast(wide_dtype.clone())?; - let sizes = self.sizes().cast(wide_dtype)?; - - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let min_max = min_max( - &offsets - .binary(sizes, Operator::Add) - .vortex_expect("`offsets + sizes` somehow overflowed"), - &mut ctx, - ) - .vortex_expect("Something went wrong while computing min and max") - .vortex_expect("We checked that the array was not empty in the top-level `rebuild`"); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let (start, end) = self.referenced_element_bounds(&mut ctx)?; - min_max - .max - .as_primitive() - .as_::() - .vortex_expect("unable to interpret the max `offset + size` as a `usize`") - }; + // SAFETY: we calculated valid start and end bounds + unsafe { self.trim_elements(start, end) } + } + /// Unsafely trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset + /// down by `start`. The result preserves the original `is_zero_copy_to_list` flag. + /// + /// # SAFETY + /// + /// `start` must be the minimum value of `offsets`, and end should be the maximum value of `offsets[i] + size[i]` + /// over all indices `i` of offsets. Otherwise, `offsets` and `sizes` may hold references to elements that no longer exist + /// and the array will be corrupted. + pub unsafe fn trim_elements(&self, start: usize, end: usize) -> VortexResult { let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| { let offset = ::from_usize(start) .vortex_expect("unable to convert the min offset `start` into a `usize`"); diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index 3799bc9612c..ad408f650b3 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -4,6 +4,7 @@ //! Tests for `compute_referenced_elements_mask`, `compute_density`, and //! `estimate_density` on `ListViewArray`. +use vortex_buffer::buffer; use vortex_error::VortexResult; use vortex_mask::Mask; use vortex_session::VortexSession; @@ -14,13 +15,16 @@ use super::common::create_large_listview; use super::common::create_overlapping_listview; use super::common::create_sparse_overlapping_listview; use crate::ExecutionCtx; +use crate::IntoArray; use crate::VortexSessionExecute; +use crate::arrays::ListViewArray; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::tests::common::create_empty_elements_listview; use crate::expr::stats::Precision; use crate::expr::stats::Stat; use crate::scalar::ScalarValue; use crate::session::ArraySession; +use crate::validity::Validity; const EPS: f32 = 1e-6; @@ -137,3 +141,39 @@ fn referenced_mask_set_bits_match_views() -> VortexResult<()> { assert!(bb.value(19)); Ok(()) } + +#[test] +fn referenced_bounds_zero_copy_full_coverage() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + // create_basic_listview references all 10 elements with no leading/trailing slack. + let lv = create_basic_listview(); + assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (0, 10)); + Ok(()) +} + +#[test] +fn referenced_bounds_zero_copy_with_slack() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + // 10 elements, views cover [2, 6): 2 leading + 4 trailing unreferenced. + let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); + let offsets = buffer![2u32, 4].into_array(); + let sizes = buffer![2u32, 2].into_array(); + let lv = unsafe { + ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable) + .with_zero_copy_to_list(true) + }; + assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (2, 6)); + Ok(()) +} + +#[test] +fn referenced_bounds_non_zero_copy() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); + let offsets = buffer![5u32, 2].into_array(); + let sizes = buffer![2u32, 2].into_array(); + let lv = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable); + assert!(!lv.is_zero_copy_to_list()); + assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (2, 7)); + Ok(()) +} diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index 53d06863557..b77d079f509 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -14,6 +14,7 @@ use crate::ExecutionCtx; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; use crate::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; +use crate::arrays::listview::DEFAULT_TRIM_ELEMENTS_THRESHOLD; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::ListViewDataParts; use crate::arrays::listview::ListViewRebuildMode; @@ -31,11 +32,28 @@ pub(super) fn to_arrow_list_view( ) -> VortexResult { let array = array.execute::(ctx)?; - // If the array is sufficiently sparse, rebuild before handing it to Arrow. Otherwise downstream - // consumers hold an elements buffer containing unreferenced data in memory indefinitely, - // and any compute pass over that buffer wastes work on data nothing references. - let density = array.upper_bound_density(ctx)?; - let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD { + // Reclaim unreferenced elements before handing the array to Arrow. Otherwise downstream + // consumers hold an elements buffer containing unreferenced data in memory indefinitely, and + // any compute pass over that buffer wastes work on data nothing references. + let array = if array.is_zero_copy_to_list() { + // A zctl array has no overlaps and no interior gaps, so the only unreferenced + // elements are leading and trailing. Trimming them is much cheaper than a full rebuild. + // Compute the referenced bounds once and reuse them for both the decision and the trim. + let n_elts = array.elements().len(); + if n_elts == 0 || array.is_empty() { + array + } else { + let (start, end) = array.referenced_element_bounds(ctx)?; + let waste = (n_elts - (end - start)) as f32 / n_elts as f32; + if waste > DEFAULT_TRIM_ELEMENTS_THRESHOLD { + // SAFETY: we calculated valid start and end bounds + unsafe { array.trim_elements(start, end)? } + } else { + array + } + } + } else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD { + // Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste. array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? } else { array @@ -106,6 +124,41 @@ mod tests { use crate::arrow::executor::list_view::PrimitiveArray; use crate::validity::Validity; + #[test] + fn trims_zero_copy_with_significant_trailing_waste() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // Zero-copy-to-list array with 10 elements but only [0, 4) referenced -> 60% waste. + // The conversion should trim the elements buffer down to the referenced range. + let elements = PrimitiveArray::new( + buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9], + Validity::NonNullable, + ); + let offsets = PrimitiveArray::new(buffer![0i32, 2], Validity::NonNullable); + let sizes = PrimitiveArray::new(buffer![2i32, 2], Validity::NonNullable); + let list_array = unsafe { + ListViewArray::new_unchecked( + elements.into_array(), + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) + .with_zero_copy_to_list(true) + }; + + let field = Field::new("item", DataType::Int32, false); + let arrow_dt = DataType::ListView(field.into()); + let arrow_array = list_array + .into_array() + .execute_arrow(Some(&arrow_dt), &mut ctx)?; + + let listview = arrow_array + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(listview.values().len(), 4); + Ok(()) + } + #[test] fn test_to_arrow_listview_i32() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); diff --git a/vortex-duckdb/src/exporter/list_view.rs b/vortex-duckdb/src/exporter/list_view.rs index 816294bb923..a0ee1614001 100644 --- a/vortex-duckdb/src/exporter/list_view.rs +++ b/vortex-duckdb/src/exporter/list_view.rs @@ -11,6 +11,7 @@ use vortex::array::ExecutionCtx; use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; +use vortex::array::arrays::listview::DEFAULT_TRIM_ELEMENTS_THRESHOLD; use vortex::array::arrays::listview::ListViewArrayExt; use vortex::array::arrays::listview::ListViewDataParts; use vortex::array::arrays::listview::ListViewRebuildMode; @@ -55,8 +56,25 @@ pub(crate) fn new_exporter( // If the array is sufficiently sparse, rebuild. Otherwise the DuckDB vector will // hold an elements buffer containing unreferenced data in memory indefinitely, // and any compute pass over that buffer wastes work on data nothing references. - let density = array.upper_bound_density(ctx)?; - let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD { + let array = if array.is_zero_copy_to_list() { + // A zctl array has no overlaps and no interior gaps, so the only unreferenced + // elements are leading and trailing. Trimming them is much cheaper than a full rebuild. + // Compute the referenced bounds once and reuse them for both the decision and the trim. + let n_elts = array.elements().len(); + if n_elts == 0 || array.is_empty() { + array + } else { + let (start, end) = array.referenced_element_bounds(ctx)?; + let waste = (n_elts - (end - start)) as f32 / n_elts as f32; + if waste > DEFAULT_TRIM_ELEMENTS_THRESHOLD { + // SAFETY: we calculated valid start and end bounds + unsafe { array.trim_elements(start, end)? } + } else { + array + } + } + } else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD { + // Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste. array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? } else { array