From f4be5efea589903970baffc675b42a375b3b4f8e Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Fri, 29 May 2026 13:02:06 -0400 Subject: [PATCH 1/6] trim zctl listview instead of full rebuild Signed-off-by: Matt Katz Signed-off-by: Matt Katz --- vortex-array/src/arrays/listview/array.rs | 24 ++++++++ vortex-array/src/arrays/listview/mod.rs | 1 + vortex-array/src/arrays/listview/rebuild.rs | 10 ++++ .../src/arrays/listview/tests/density.rs | 30 ++++++++++ vortex-array/src/arrow/executor/list_view.rs | 55 +++++++++++++++++-- 5 files changed, 115 insertions(+), 5 deletions(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 64e59d1687f..91eaa13b38b 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -522,6 +522,30 @@ pub trait ListViewArrayExt: TypedArrayRef { Ok(estimate) } + + /// Proportion of `elements` that lies before the first referenced element or after the last, + /// in `[0.0, 1.0]`. Computed in `O(1)` from the first and last views. + /// + /// This is the fraction of the `elements` buffer that a + /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim, but it is + /// only correct for **zero-copy-to-list** arrays. There, views are sorted and non-overlapping + /// with no interior gaps, so every unreferenced element is leading or trailing and the + /// referenced range is exactly `[first_offset, last_offset + last_size)`. + /// + /// Returns `0.0` when `elements` is empty or the array has no lists. + fn proportion_tail_unreferenced(&self) -> f32 { + let n_elts = self.elements().len(); + let n_lists = self.as_ref().len(); + if n_elts == 0 || n_lists == 0 { + return 0.0; + } + + let start = self.offset_at(0); + let end = self.offset_at(n_lists - 1) + self.size_at(n_lists - 1); + let referenced = end - start; + + (n_elts - referenced) as f32 / n_elts as f32 + } } impl> ListViewArrayExt for T {} diff --git a/vortex-array/src/arrays/listview/mod.rs b/vortex-array/src/arrays/listview/mod.rs index 29dc50f9b9f..526dba396c1 100644 --- a/vortex-array/src/arrays/listview/mod.rs +++ b/vortex-array/src/arrays/listview/mod.rs @@ -19,6 +19,7 @@ pub use conversion::recursive_list_from_list_view; mod rebuild; pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD; +pub use rebuild::DEFAULT_TRIM_WASTE_THRESHOLD; pub use rebuild::ListViewRebuildMode; #[cfg(test)] diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 21dbcd70cee..22dce29e06e 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -54,6 +54,16 @@ fn rebuilt_offset_ptype(offsets_ptype: PType) -> PType { /// list element dtypes. pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1; +/// Waste threshold to decide whether to trim a zero-copy-to-list `ListViewArray`. +/// +/// A zero-copy-to-list array has no overlaps and no interior gaps, so its only unreferenced bytes +/// are leading and trailing elements. Trimming those is much cheaper than a full rebuild (a lazy +/// `elements` slice plus an `O(num_lists)` offset adjustment, with no element data copy), so we use +/// a more aggressive threshold than [`DEFAULT_REBUILD_DENSITY_THRESHOLD`]. +/// +/// When the unreferenced (leading + trailing) fraction of `elements` exceeds this threshold, we trim. +pub const DEFAULT_TRIM_WASTE_THRESHOLD: f32 = 0.05; + /// Modes for rebuilding a [`ListViewArray`]. pub enum ListViewRebuildMode { /// Removes all unused data and flattens out all list data, such that the array is zero-copyable diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index 3799bc9612c..d94c3429e5d 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -4,6 +4,7 @@ //! Tests for `compute_referenced_elements_mask`, `compute_density`, and //! `estimate_density` on `ListViewArray`. +use vortex_buffer::buffer; use vortex_error::VortexResult; use vortex_mask::Mask; use vortex_session::VortexSession; @@ -14,13 +15,16 @@ use super::common::create_large_listview; use super::common::create_overlapping_listview; use super::common::create_sparse_overlapping_listview; use crate::ExecutionCtx; +use crate::IntoArray; use crate::VortexSessionExecute; +use crate::arrays::ListViewArray; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::tests::common::create_empty_elements_listview; use crate::expr::stats::Precision; use crate::expr::stats::Stat; use crate::scalar::ScalarValue; use crate::session::ArraySession; +use crate::validity::Validity; const EPS: f32 = 1e-6; @@ -137,3 +141,29 @@ fn referenced_mask_set_bits_match_views() -> VortexResult<()> { assert!(bb.value(19)); Ok(()) } + +#[test] +fn tail_unreferenced_zero_when_fully_referenced() { + // create_basic_listview references all 10 elements with no leading/trailing slack. + let lv = create_basic_listview(); + assert!(lv.proportion_tail_unreferenced().abs() < EPS); +} + +#[test] +fn tail_unreferenced_counts_leading_and_trailing() { + // 10 elements, views cover [2, 6): 2 leading + 4 trailing unreferenced -> 0.6. + let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); + let offsets = buffer![2u32, 4].into_array(); + let sizes = buffer![2u32, 2].into_array(); + let lv = unsafe { + ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable) + .with_zero_copy_to_list(true) + }; + assert!((lv.proportion_tail_unreferenced() - 0.6).abs() < EPS); +} + +#[test] +fn tail_unreferenced_zero_for_empty_elements() { + let lv = create_empty_elements_listview(); + assert_eq!(lv.proportion_tail_unreferenced(), 0.0); +} diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index 53d06863557..8bc836b59bf 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -14,6 +14,7 @@ use crate::ExecutionCtx; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; use crate::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; +use crate::arrays::listview::DEFAULT_TRIM_WASTE_THRESHOLD; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::ListViewDataParts; use crate::arrays::listview::ListViewRebuildMode; @@ -31,11 +32,20 @@ pub(super) fn to_arrow_list_view( ) -> VortexResult { let array = array.execute::(ctx)?; - // If the array is sufficiently sparse, rebuild before handing it to Arrow. Otherwise downstream - // consumers hold an elements buffer containing unreferenced data in memory indefinitely, - // and any compute pass over that buffer wastes work on data nothing references. - let density = array.upper_bound_density(ctx)?; - let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD { + // Reclaim unreferenced elements before handing the array to Arrow. Otherwise downstream + // consumers hold an elements buffer containing unreferenced data in memory indefinitely, and + // any compute pass over that buffer wastes work on data nothing references. + let array = if array.is_zero_copy_to_list() { + // A zero-copy-to-list array has no overlaps and no interior gaps, so the only unreferenced + // elements are leading and trailing. Trimming them is much cheaper than a full rebuild, so + // detect the waste in O(1) and trim only when it is significant. + if array.proportion_tail_unreferenced() > DEFAULT_TRIM_WASTE_THRESHOLD { + array.rebuild(ListViewRebuildMode::TrimElements)? + } else { + array + } + } else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD { + // Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste. array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? } else { array @@ -106,6 +116,41 @@ mod tests { use crate::arrow::executor::list_view::PrimitiveArray; use crate::validity::Validity; + #[test] + fn trims_zero_copy_with_significant_trailing_waste() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // Zero-copy-to-list array with 10 elements but only [0, 4) referenced -> 60% waste. + // The conversion should trim the elements buffer down to the referenced range. + let elements = PrimitiveArray::new( + buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9], + Validity::NonNullable, + ); + let offsets = PrimitiveArray::new(buffer![0i32, 2], Validity::NonNullable); + let sizes = PrimitiveArray::new(buffer![2i32, 2], Validity::NonNullable); + let list_array = unsafe { + ListViewArray::new_unchecked( + elements.into_array(), + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) + .with_zero_copy_to_list(true) + }; + + let field = Field::new("item", DataType::Int32, false); + let arrow_dt = DataType::ListView(field.into()); + let arrow_array = list_array + .into_array() + .execute_arrow(Some(&arrow_dt), &mut ctx)?; + + let listview = arrow_array + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(listview.values().len(), 4); + Ok(()) + } + #[test] fn test_to_arrow_listview_i32() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); From 42ee8469d705c6f4ca2307d60cf49300f12a3cee Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Fri, 29 May 2026 13:15:20 -0400 Subject: [PATCH 2/6] generalize prop_tail_unreferenced to non-zctl listviews Signed-off-by: Matt Katz Signed-off-by: Matt Katz --- vortex-array/src/arrays/listview/array.rs | 76 +++++++++++++++---- vortex-array/src/arrays/listview/rebuild.rs | 53 +------------ .../src/arrays/listview/tests/density.rs | 32 ++++++-- vortex-array/src/arrow/executor/list_view.rs | 2 +- 4 files changed, 92 insertions(+), 71 deletions(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 91eaa13b38b..33928b99a52 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -22,6 +22,7 @@ use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; use crate::VortexSessionExecute; +use crate::aggregate_fn::fns::min_max::min_max; use crate::array::Array; use crate::array::ArrayParts; use crate::array::TypedArrayRef; @@ -32,11 +33,14 @@ use crate::arrays::Primitive; use crate::arrays::PrimitiveArray; use crate::arrays::bool; use crate::arrays::primitive::PrimitiveArrayExt; +use crate::builtins::ArrayBuiltins; use crate::dtype::DType; use crate::dtype::IntegerPType; +use crate::dtype::PType; use crate::expr::stats::Stat; use crate::match_each_integer_ptype; use crate::match_each_unsigned_integer_ptype; +use crate::scalar_fn::fns::operators::Operator; use crate::validity::Validity; /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and @@ -523,28 +527,74 @@ pub trait ListViewArrayExt: TypedArrayRef { Ok(estimate) } - /// Proportion of `elements` that lies before the first referenced element or after the last, - /// in `[0.0, 1.0]`. Computed in `O(1)` from the first and last views. + /// Returns the half-open range `[start, end)` of `elements` indices referenced by any view: + /// the minimum offset and the maximum `offset + size`. Elements outside this range are + /// unreferenced leading or trailing slack that a + /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim. + /// + /// For **zero-copy-to-list** arrays this is `O(1)`: views are sorted and non-overlapping with + /// no interior gaps, so the bounds are exactly `[first_offset, last_offset + last_size)`. + /// Otherwise it computes min/max statistics over `offsets` and `offsets + sizes`. + /// + /// # Preconditions + /// + /// The array must contain at least one list (`len() > 0`). + fn referenced_element_bounds(&self, ctx: &mut ExecutionCtx) -> VortexResult<(usize, usize)> { + let n_lists = self.as_ref().len(); + assert!( + n_lists > 0, + "referenced_element_bounds requires a non-empty array" + ); + + if self.is_zero_copy_to_list() { + let start = self.offset_at(0); + let end = self.offset_at(n_lists - 1) + self.size_at(n_lists - 1); + return Ok((start, end)); + } + + let start = self + .offsets() + .statistics() + .compute_min::(ctx) + .vortex_expect("offsets must report a usize min statistic"); + + // Cast offsets and sizes to the widest integer type so that `offset + size` cannot overflow + // the narrower input width. + let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() { + PType::U64 + } else { + PType::I64 + }); + let offsets = self.offsets().cast(wide_dtype.clone())?; + let sizes = self.sizes().cast(wide_dtype)?; + let end = min_max(&offsets.binary(sizes, Operator::Add)?, ctx)? + .vortex_expect("non-empty array must report a min/max") + .max + .as_primitive() + .as_::() + .vortex_expect("max `offset + size` must fit in a usize"); + + Ok((start, end)) + } + + /// Proportion of `elements` that is unreferenced leading or trailing slack, in `[0.0, 1.0]`. /// /// This is the fraction of the `elements` buffer that a - /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim, but it is - /// only correct for **zero-copy-to-list** arrays. There, views are sorted and non-overlapping - /// with no interior gaps, so every unreferenced element is leading or trailing and the - /// referenced range is exactly `[first_offset, last_offset + last_size)`. + /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim. It is + /// exact and `O(1)` for zero-copy-to-list arrays and otherwise computes min/max statistics + /// over the offsets (see [`referenced_element_bounds`](Self::referenced_element_bounds)). /// /// Returns `0.0` when `elements` is empty or the array has no lists. - fn proportion_tail_unreferenced(&self) -> f32 { + fn prop_tail_unreferenced(&self, ctx: &mut ExecutionCtx) -> VortexResult { let n_elts = self.elements().len(); - let n_lists = self.as_ref().len(); - if n_elts == 0 || n_lists == 0 { - return 0.0; + if n_elts == 0 || self.as_ref().is_empty() { + return Ok(0.0); } - let start = self.offset_at(0); - let end = self.offset_at(n_lists - 1) + self.size_at(n_lists - 1); + let (start, end) = self.referenced_element_bounds(ctx)?; let referenced = end - start; - (n_elts - referenced) as f32 / n_elts as f32 + Ok((n_elts - referenced) as f32 / n_elts as f32) } } impl> ListViewArrayExt for T {} diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 22dce29e06e..0d389c391d5 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -11,7 +11,6 @@ use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; use crate::VortexSessionExecute; -use crate::aggregate_fn::fns::min_max::min_max; use crate::arrays::ConstantArray; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; @@ -19,7 +18,6 @@ use crate::arrays::listview::ListViewArrayExt; use crate::arrays::primitive::PrimitiveArrayExt; use crate::builders::builder_with_capacity; use crate::builtins::ArrayBuiltins; -use crate::dtype::DType; use crate::dtype::IntegerPType; use crate::dtype::Nullability; use crate::dtype::PType; @@ -347,55 +345,8 @@ impl ListViewArray { /// elements, which is defined as a contiguous run of values in the `elements` array that are /// not referecened by any views in the corresponding [`ListViewArray`]. fn rebuild_trim_elements(&self) -> VortexResult { - let start = if self.is_zero_copy_to_list() { - // If offsets are sorted, then the minimum offset is the first offset. - // Note that even if the first view is null, offsets must always be valid, so it is - // completely fine for us to use this as a lower-bounded start of the `elements`. - self.offset_at(0) - } else { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - self.offsets() - .statistics() - .compute_min(&mut ctx) - .vortex_expect( - "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`", - ) - }; - - let end = if self.is_zero_copy_to_list() { - // If offsets are sorted and there are no overlaps (views are always "increasing"), we - // can just grab the last offset and last size. - let last_offset = self.offset_at(self.len() - 1); - let last_size = self.size_at(self.len() - 1); - last_offset + last_size - } else { - // Cast offsets and sizes to the widest integer type to prevent - // overflow when computing offsets + sizes. The end offset may not - // fit in the integer width otherwise. - let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() { - PType::U64 - } else { - PType::I64 - }); - let offsets = self.offsets().cast(wide_dtype.clone())?; - let sizes = self.sizes().cast(wide_dtype)?; - - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let min_max = min_max( - &offsets - .binary(sizes, Operator::Add) - .vortex_expect("`offsets + sizes` somehow overflowed"), - &mut ctx, - ) - .vortex_expect("Something went wrong while computing min and max") - .vortex_expect("We checked that the array was not empty in the top-level `rebuild`"); - - min_max - .max - .as_primitive() - .as_::() - .vortex_expect("unable to interpret the max `offset + size` as a `usize`") - }; + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let (start, end) = self.referenced_element_bounds(&mut ctx)?; let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| { let offset = ::from_usize(start) diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index d94c3429e5d..75648af0303 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -143,14 +143,17 @@ fn referenced_mask_set_bits_match_views() -> VortexResult<()> { } #[test] -fn tail_unreferenced_zero_when_fully_referenced() { +fn tail_unreferenced_zero_when_fully_referenced() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); // create_basic_listview references all 10 elements with no leading/trailing slack. let lv = create_basic_listview(); - assert!(lv.proportion_tail_unreferenced().abs() < EPS); + assert!(lv.prop_tail_unreferenced(&mut ctx)?.abs() < EPS); + Ok(()) } #[test] -fn tail_unreferenced_counts_leading_and_trailing() { +fn tail_unreferenced_counts_leading_and_trailing_zero_copy() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); // 10 elements, views cover [2, 6): 2 leading + 4 trailing unreferenced -> 0.6. let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); let offsets = buffer![2u32, 4].into_array(); @@ -159,11 +162,28 @@ fn tail_unreferenced_counts_leading_and_trailing() { ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable) .with_zero_copy_to_list(true) }; - assert!((lv.proportion_tail_unreferenced() - 0.6).abs() < EPS); + assert!((lv.prop_tail_unreferenced(&mut ctx)? - 0.6).abs() < EPS); + Ok(()) } #[test] -fn tail_unreferenced_zero_for_empty_elements() { +fn tail_unreferenced_non_zero_copy_uses_stats() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + // Out-of-order offsets -> not zero-copy-to-list, so bounds come from min/max stats. + // 10 elements, views [5,7) and [2,4): min offset 2, max end 7 -> referenced [2, 7) -> 0.5. + let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); + let offsets = buffer![5u32, 2].into_array(); + let sizes = buffer![2u32, 2].into_array(); + let lv = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable); + assert!(!lv.is_zero_copy_to_list()); + assert!((lv.prop_tail_unreferenced(&mut ctx)? - 0.5).abs() < EPS); + Ok(()) +} + +#[test] +fn tail_unreferenced_zero_for_empty_elements() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); let lv = create_empty_elements_listview(); - assert_eq!(lv.proportion_tail_unreferenced(), 0.0); + assert_eq!(lv.prop_tail_unreferenced(&mut ctx)?, 0.0); + Ok(()) } diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index 8bc836b59bf..5e5cffd5a7b 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -39,7 +39,7 @@ pub(super) fn to_arrow_list_view( // A zero-copy-to-list array has no overlaps and no interior gaps, so the only unreferenced // elements are leading and trailing. Trimming them is much cheaper than a full rebuild, so // detect the waste in O(1) and trim only when it is significant. - if array.proportion_tail_unreferenced() > DEFAULT_TRIM_WASTE_THRESHOLD { + if array.prop_tail_unreferenced(ctx)? > DEFAULT_TRIM_WASTE_THRESHOLD { array.rebuild(ListViewRebuildMode::TrimElements)? } else { array From 312337aa3604b04a792cee3521c88d0be8be24ac Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Fri, 29 May 2026 13:44:24 -0400 Subject: [PATCH 3/6] add trim_elements(start, end) to reuse known bounds Signed-off-by: Matt Katz Signed-off-by: Matt Katz --- vortex-array/src/arrays/listview/rebuild.rs | 15 +++++++++++++++ vortex-array/src/arrow/executor/list_view.rs | 17 ++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 0d389c391d5..841470e02f9 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -75,6 +75,9 @@ pub enum ListViewRebuildMode { /// Removes any leading or trailing elements that are unused / not referenced by any views in /// the [`ListViewArray`]. + /// + /// If the referenced `[start, end)` bounds are already known, prefer calling + /// [`trim_elements`](ListViewArray::trim_elements) directly to avoid recomputing them. TrimElements, /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`. @@ -347,7 +350,19 @@ impl ListViewArray { fn rebuild_trim_elements(&self) -> VortexResult { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let (start, end) = self.referenced_element_bounds(&mut ctx)?; + self.trim_elements(start, end) + } + /// Trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset + /// down by `start`. The result preserves the original `is_zero_copy_to_list` flag. + /// + /// `start` and `end` MUST be the referenced-element bounds from + /// [`referenced_element_bounds`](ListViewArrayExt::referenced_element_bounds); any other range + /// yields a logically corrupt array. Prefer `rebuild(`[`TrimElements`]`)` unless the bounds are + /// already known, in which case this entry point avoids recomputing them. + /// + /// [`TrimElements`]: ListViewRebuildMode::TrimElements + pub(crate) fn trim_elements(&self, start: usize, end: usize) -> VortexResult { let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| { let offset = ::from_usize(start) .vortex_expect("unable to convert the min offset `start` into a `usize`"); diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index 5e5cffd5a7b..eadfc2d8585 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -37,12 +37,19 @@ pub(super) fn to_arrow_list_view( // any compute pass over that buffer wastes work on data nothing references. let array = if array.is_zero_copy_to_list() { // A zero-copy-to-list array has no overlaps and no interior gaps, so the only unreferenced - // elements are leading and trailing. Trimming them is much cheaper than a full rebuild, so - // detect the waste in O(1) and trim only when it is significant. - if array.prop_tail_unreferenced(ctx)? > DEFAULT_TRIM_WASTE_THRESHOLD { - array.rebuild(ListViewRebuildMode::TrimElements)? - } else { + // elements are leading and trailing. Trimming them is much cheaper than a full rebuild. + // Compute the referenced bounds once and reuse them for both the decision and the trim. + let n_elts = array.elements().len(); + if n_elts == 0 || array.is_empty() { array + } else { + let (start, end) = array.referenced_element_bounds(ctx)?; + let waste = (n_elts - (end - start)) as f32 / n_elts as f32; + if waste > DEFAULT_TRIM_WASTE_THRESHOLD { + array.trim_elements(start, end)? + } else { + array + } } } else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD { // Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste. From 64cebb1448f018265392036fd585f6d6bf0883fb Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Fri, 29 May 2026 13:49:09 -0400 Subject: [PATCH 4/6] drop prop_tail_unreferenced, compute waste inline Signed-off-by: Matt Katz Signed-off-by: Matt Katz --- vortex-array/src/arrays/listview/array.rs | 20 ---------------- .../src/arrays/listview/tests/density.rs | 24 +++++++------------ 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 33928b99a52..f117b290575 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -576,26 +576,6 @@ pub trait ListViewArrayExt: TypedArrayRef { Ok((start, end)) } - - /// Proportion of `elements` that is unreferenced leading or trailing slack, in `[0.0, 1.0]`. - /// - /// This is the fraction of the `elements` buffer that a - /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim. It is - /// exact and `O(1)` for zero-copy-to-list arrays and otherwise computes min/max statistics - /// over the offsets (see [`referenced_element_bounds`](Self::referenced_element_bounds)). - /// - /// Returns `0.0` when `elements` is empty or the array has no lists. - fn prop_tail_unreferenced(&self, ctx: &mut ExecutionCtx) -> VortexResult { - let n_elts = self.elements().len(); - if n_elts == 0 || self.as_ref().is_empty() { - return Ok(0.0); - } - - let (start, end) = self.referenced_element_bounds(ctx)?; - let referenced = end - start; - - Ok((n_elts - referenced) as f32 / n_elts as f32) - } } impl> ListViewArrayExt for T {} diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index 75648af0303..54e3b8fbfdb 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -143,18 +143,18 @@ fn referenced_mask_set_bits_match_views() -> VortexResult<()> { } #[test] -fn tail_unreferenced_zero_when_fully_referenced() -> VortexResult<()> { +fn referenced_bounds_zero_copy_full_coverage() -> VortexResult<()> { let mut ctx = test_execution_ctx(); // create_basic_listview references all 10 elements with no leading/trailing slack. let lv = create_basic_listview(); - assert!(lv.prop_tail_unreferenced(&mut ctx)?.abs() < EPS); + assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (0, 10)); Ok(()) } #[test] -fn tail_unreferenced_counts_leading_and_trailing_zero_copy() -> VortexResult<()> { +fn referenced_bounds_zero_copy_with_slack() -> VortexResult<()> { let mut ctx = test_execution_ctx(); - // 10 elements, views cover [2, 6): 2 leading + 4 trailing unreferenced -> 0.6. + // 10 elements, views cover [2, 6): 2 leading + 4 trailing unreferenced. let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); let offsets = buffer![2u32, 4].into_array(); let sizes = buffer![2u32, 2].into_array(); @@ -162,28 +162,20 @@ fn tail_unreferenced_counts_leading_and_trailing_zero_copy() -> VortexResult<()> ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable) .with_zero_copy_to_list(true) }; - assert!((lv.prop_tail_unreferenced(&mut ctx)? - 0.6).abs() < EPS); + assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (2, 6)); Ok(()) } #[test] -fn tail_unreferenced_non_zero_copy_uses_stats() -> VortexResult<()> { +fn referenced_bounds_non_zero_copy_uses_stats() -> VortexResult<()> { let mut ctx = test_execution_ctx(); // Out-of-order offsets -> not zero-copy-to-list, so bounds come from min/max stats. - // 10 elements, views [5,7) and [2,4): min offset 2, max end 7 -> referenced [2, 7) -> 0.5. + // Views [5, 7) and [2, 4): min offset 2, max end 7 -> referenced [2, 7). let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); let offsets = buffer![5u32, 2].into_array(); let sizes = buffer![2u32, 2].into_array(); let lv = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable); assert!(!lv.is_zero_copy_to_list()); - assert!((lv.prop_tail_unreferenced(&mut ctx)? - 0.5).abs() < EPS); - Ok(()) -} - -#[test] -fn tail_unreferenced_zero_for_empty_elements() -> VortexResult<()> { - let mut ctx = test_execution_ctx(); - let lv = create_empty_elements_listview(); - assert_eq!(lv.prop_tail_unreferenced(&mut ctx)?, 0.0); + assert_eq!(lv.referenced_element_bounds(&mut ctx)?, (2, 7)); Ok(()) } From aee55aa3be0107fc868f2f996aabcb84dc6a59f2 Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Fri, 29 May 2026 15:10:13 -0400 Subject: [PATCH 5/6] fixes Signed-off-by: Matt Katz --- vortex-array/src/arrays/listview/array.rs | 2 +- vortex-array/src/arrays/listview/tests/density.rs | 4 +--- vortex-array/src/arrow/executor/list_view.rs | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index f117b290575..e5b6f9aaf99 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -541,7 +541,7 @@ pub trait ListViewArrayExt: TypedArrayRef { /// The array must contain at least one list (`len() > 0`). fn referenced_element_bounds(&self, ctx: &mut ExecutionCtx) -> VortexResult<(usize, usize)> { let n_lists = self.as_ref().len(); - assert!( + vortex_ensure!( n_lists > 0, "referenced_element_bounds requires a non-empty array" ); diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs index 54e3b8fbfdb..ad408f650b3 100644 --- a/vortex-array/src/arrays/listview/tests/density.rs +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -167,10 +167,8 @@ fn referenced_bounds_zero_copy_with_slack() -> VortexResult<()> { } #[test] -fn referenced_bounds_non_zero_copy_uses_stats() -> VortexResult<()> { +fn referenced_bounds_non_zero_copy() -> VortexResult<()> { let mut ctx = test_execution_ctx(); - // Out-of-order offsets -> not zero-copy-to-list, so bounds come from min/max stats. - // Views [5, 7) and [2, 4): min offset 2, max end 7 -> referenced [2, 7). let elements = buffer![0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array(); let offsets = buffer![5u32, 2].into_array(); let sizes = buffer![2u32, 2].into_array(); diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index eadfc2d8585..637c5e4a77b 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -36,7 +36,7 @@ pub(super) fn to_arrow_list_view( // consumers hold an elements buffer containing unreferenced data in memory indefinitely, and // any compute pass over that buffer wastes work on data nothing references. let array = if array.is_zero_copy_to_list() { - // A zero-copy-to-list array has no overlaps and no interior gaps, so the only unreferenced + // A zctl array has no overlaps and no interior gaps, so the only unreferenced // elements are leading and trailing. Trimming them is much cheaper than a full rebuild. // Compute the referenced bounds once and reuse them for both the decision and the trim. let n_elts = array.elements().len(); From e24a01460ded097691d863c492b5c07c6ec6c033 Mon Sep 17 00:00:00 2001 From: Matt Katz Date: Fri, 29 May 2026 15:26:54 -0400 Subject: [PATCH 6/6] fixes Signed-off-by: Matt Katz --- vortex-array/src/arrays/listview/mod.rs | 2 +- vortex-array/src/arrays/listview/rebuild.rs | 21 ++++++++++--------- vortex-array/src/arrow/executor/list_view.rs | 7 ++++--- vortex-duckdb/src/exporter/list_view.rs | 22 ++++++++++++++++++-- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/vortex-array/src/arrays/listview/mod.rs b/vortex-array/src/arrays/listview/mod.rs index 526dba396c1..d5674372b2c 100644 --- a/vortex-array/src/arrays/listview/mod.rs +++ b/vortex-array/src/arrays/listview/mod.rs @@ -19,7 +19,7 @@ pub use conversion::recursive_list_from_list_view; mod rebuild; pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD; -pub use rebuild::DEFAULT_TRIM_WASTE_THRESHOLD; +pub use rebuild::DEFAULT_TRIM_ELEMENTS_THRESHOLD; pub use rebuild::ListViewRebuildMode; #[cfg(test)] diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 841470e02f9..2bcb4ef0c1f 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -60,7 +60,7 @@ pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1; /// a more aggressive threshold than [`DEFAULT_REBUILD_DENSITY_THRESHOLD`]. /// /// When the unreferenced (leading + trailing) fraction of `elements` exceeds this threshold, we trim. -pub const DEFAULT_TRIM_WASTE_THRESHOLD: f32 = 0.05; +pub const DEFAULT_TRIM_ELEMENTS_THRESHOLD: f32 = 0.05; /// Modes for rebuilding a [`ListViewArray`]. pub enum ListViewRebuildMode { @@ -346,23 +346,24 @@ impl ListViewArray { /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing /// elements, which is defined as a contiguous run of values in the `elements` array that are - /// not referecened by any views in the corresponding [`ListViewArray`]. + /// not referenced by any views in the corresponding [`ListViewArray`]. fn rebuild_trim_elements(&self) -> VortexResult { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let (start, end) = self.referenced_element_bounds(&mut ctx)?; - self.trim_elements(start, end) + + // SAFETY: we calculated valid start and end bounds + unsafe { self.trim_elements(start, end) } } - /// Trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset + /// Unsafely trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset /// down by `start`. The result preserves the original `is_zero_copy_to_list` flag. /// - /// `start` and `end` MUST be the referenced-element bounds from - /// [`referenced_element_bounds`](ListViewArrayExt::referenced_element_bounds); any other range - /// yields a logically corrupt array. Prefer `rebuild(`[`TrimElements`]`)` unless the bounds are - /// already known, in which case this entry point avoids recomputing them. + /// # SAFETY /// - /// [`TrimElements`]: ListViewRebuildMode::TrimElements - pub(crate) fn trim_elements(&self, start: usize, end: usize) -> VortexResult { + /// `start` must be the minimum value of `offsets`, and end should be the maximum value of `offsets[i] + size[i]` + /// over all indices `i` of offsets. Otherwise, `offsets` and `sizes` may hold references to elements that no longer exist + /// and the array will be corrupted. + pub unsafe fn trim_elements(&self, start: usize, end: usize) -> VortexResult { let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| { let offset = ::from_usize(start) .vortex_expect("unable to convert the min offset `start` into a `usize`"); diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index 637c5e4a77b..b77d079f509 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -14,7 +14,7 @@ use crate::ExecutionCtx; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; use crate::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; -use crate::arrays::listview::DEFAULT_TRIM_WASTE_THRESHOLD; +use crate::arrays::listview::DEFAULT_TRIM_ELEMENTS_THRESHOLD; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::ListViewDataParts; use crate::arrays::listview::ListViewRebuildMode; @@ -45,8 +45,9 @@ pub(super) fn to_arrow_list_view( } else { let (start, end) = array.referenced_element_bounds(ctx)?; let waste = (n_elts - (end - start)) as f32 / n_elts as f32; - if waste > DEFAULT_TRIM_WASTE_THRESHOLD { - array.trim_elements(start, end)? + if waste > DEFAULT_TRIM_ELEMENTS_THRESHOLD { + // SAFETY: we calculated valid start and end bounds + unsafe { array.trim_elements(start, end)? } } else { array } diff --git a/vortex-duckdb/src/exporter/list_view.rs b/vortex-duckdb/src/exporter/list_view.rs index 816294bb923..a0ee1614001 100644 --- a/vortex-duckdb/src/exporter/list_view.rs +++ b/vortex-duckdb/src/exporter/list_view.rs @@ -11,6 +11,7 @@ use vortex::array::ExecutionCtx; use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; +use vortex::array::arrays::listview::DEFAULT_TRIM_ELEMENTS_THRESHOLD; use vortex::array::arrays::listview::ListViewArrayExt; use vortex::array::arrays::listview::ListViewDataParts; use vortex::array::arrays::listview::ListViewRebuildMode; @@ -55,8 +56,25 @@ pub(crate) fn new_exporter( // If the array is sufficiently sparse, rebuild. Otherwise the DuckDB vector will // hold an elements buffer containing unreferenced data in memory indefinitely, // and any compute pass over that buffer wastes work on data nothing references. - let density = array.upper_bound_density(ctx)?; - let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD { + let array = if array.is_zero_copy_to_list() { + // A zctl array has no overlaps and no interior gaps, so the only unreferenced + // elements are leading and trailing. Trimming them is much cheaper than a full rebuild. + // Compute the referenced bounds once and reuse them for both the decision and the trim. + let n_elts = array.elements().len(); + if n_elts == 0 || array.is_empty() { + array + } else { + let (start, end) = array.referenced_element_bounds(ctx)?; + let waste = (n_elts - (end - start)) as f32 / n_elts as f32; + if waste > DEFAULT_TRIM_ELEMENTS_THRESHOLD { + // SAFETY: we calculated valid start and end bounds + unsafe { array.trim_elements(start, end)? } + } else { + array + } + } + } else if array.upper_bound_density(ctx)? < DEFAULT_REBUILD_DENSITY_THRESHOLD { + // Overlaps, gaps, or garbage may be present, so a full rebuild is needed to reclaim waste. array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? } else { array