From 50b0e93321f5084410bbebc7b0c896c671a2c298 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 28 May 2026 22:53:28 +0000 Subject: [PATCH 1/6] Speed up VarBinView build_views single-buffer canonicalization build_views is the hot loop that turns a decoded string heap plus per-element lengths into VarBinView views. It is shared by FSST, VarBin, zstd and dict canonicalization. The previous implementation checked for a 2 GiB buffer rollover on every element and called the out-of-line (#[inline(never)]) BinaryView::make_view per element. Add a fast path for the common case where the whole decoded heap fits in a single buffer (bytes.len() <= max_buffer_len, which bounds every offset). In that path there can be no rollover, so the loop drops the per-element rollover branch and builds the reference-view variant inline, avoiding the make_view call for long (>12 byte) values. Output is byte-for-byte identical to the previous code; the multi-buffer path is unchanged. Signed-off-by: Joe Isaacs --- .../src/arrays/varbinview/build_views.rs | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/vortex-array/src/arrays/varbinview/build_views.rs b/vortex-array/src/arrays/varbinview/build_views.rs index 87f5c84706b..e7c753c068a 100644 --- a/vortex-array/src/arrays/varbinview/build_views.rs +++ b/vortex-array/src/arrays/varbinview/build_views.rs @@ -9,6 +9,7 @@ use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; pub use crate::arrays::varbinview::BinaryView; +use crate::arrays::varbinview::Ref; use crate::dtype::NativePType; /// Convert an offsets buffer to a buffer of element lengths. @@ -33,6 +34,43 @@ pub fn build_views>( ) -> (Vec, Buffer) { let mut views = BufferMut::::with_capacity(lens.len()); + // Common case: the whole decoded heap fits within a single buffer, so no rollover can occur + // (`bytes.len()` is the total decoded size and therefore an upper bound on every offset). This + // lets the hot loop drop the per-element rollover branch and construct reference views inline, + // avoiding the out-of-line `BinaryView::make_view` call for the common long-string case. + if bytes.len() <= max_buffer_len { + let data = bytes.as_slice(); + let mut offset = 0usize; + for &len in lens { + let len = len.as_(); + let value = &data[offset..offset + len]; + let view = if len > BinaryView::MAX_INLINED_SIZE { + let mut prefix = [0u8; 4]; + prefix.copy_from_slice(&value[..4]); + BinaryView { + _ref: Ref { + size: len.as_(), + prefix, + buffer_index: start_buf_index, + offset: offset.as_(), + }, + } + } else { + BinaryView::make_view(value, start_buf_index, offset.as_()) + }; + // SAFETY: we reserved capacity for exactly `lens.len()` views above. + unsafe { views.push_unchecked(view) }; + offset += len; + } + + let buffers = if bytes.is_empty() { + Vec::new() + } else { + vec![bytes.freeze()] + }; + return (buffers, views.freeze()); + } + let mut buffers = Vec::new(); let mut buf_index = start_buf_index; From 76e17aa2f5c39c87a09a1024bfbc294adf08f6d5 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 11:50:57 +0000 Subject: [PATCH 2/6] Optimize build_views single-buffer hot loop write path Builds on the single-buffer fast path with two changes to the per-view write, measured at ~12% faster on the long-string canonicalization path (8.13 -> 7.10 ns/view, isolated build_views over 600k reference views, native/AVX-512, pinned core, 6 samples each): 1. Write views into the reserved spare capacity via an iterator instead of `push_unchecked`. `push_unchecked` advances the backing buffer's length on every call, which the optimizer cannot prove is loop-invariant, so it reloads and rewrites the output cursor through the stack each iteration. Writing into the spare slice and calling `set_len` once keeps the cursor in a register. 2. Assemble the reference view as a single `u128` via a new `BinaryView::new_ref` constructor, so the compiler emits one wide store per view rather than separate field stores. The constructor matches the little-endian field order already assumed by `BinaryView` (`le_bytes`, `From`, `as_u128`). A new `new_ref_matches_make_view` test asserts the `u128` assembly is byte-identical to the value-inspecting `make_view` across several lengths. Existing build_views and vortex-fsst canonicalization tests pass; clippy and rustfmt are clean. Signed-off-by: Joe Isaacs --- .../src/arrays/varbinview/build_views.rs | 24 +++++----- vortex-array/src/arrays/varbinview/view.rs | 45 +++++++++++++++++++ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/vortex-array/src/arrays/varbinview/build_views.rs b/vortex-array/src/arrays/varbinview/build_views.rs index e7c753c068a..6ef87111610 100644 --- a/vortex-array/src/arrays/varbinview/build_views.rs +++ b/vortex-array/src/arrays/varbinview/build_views.rs @@ -9,7 +9,6 @@ use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; pub use crate::arrays::varbinview::BinaryView; -use crate::arrays::varbinview::Ref; use crate::dtype::NativePType; /// Convert an offsets buffer to a buffer of element lengths. @@ -41,27 +40,28 @@ pub fn build_views>( if bytes.len() <= max_buffer_len { let data = bytes.as_slice(); let mut offset = 0usize; - for &len in lens { + // Write directly into the reserved spare capacity rather than `push_unchecked`. The latter + // advances the backing buffer's length on every call, which the optimizer cannot prove is + // loop-invariant, so it reloads and rewrites the output cursor through the stack each + // iteration. Writing into the spare slice keeps the cursor in a register and the length is + // set once after the loop. + let spare = views.spare_capacity_mut(); + for (slot, &len) in spare.iter_mut().zip(lens) { let len = len.as_(); let value = &data[offset..offset + len]; let view = if len > BinaryView::MAX_INLINED_SIZE { let mut prefix = [0u8; 4]; prefix.copy_from_slice(&value[..4]); - BinaryView { - _ref: Ref { - size: len.as_(), - prefix, - buffer_index: start_buf_index, - offset: offset.as_(), - }, - } + BinaryView::new_ref(len.as_(), prefix, start_buf_index, offset.as_()) } else { BinaryView::make_view(value, start_buf_index, offset.as_()) }; - // SAFETY: we reserved capacity for exactly `lens.len()` views above. - unsafe { views.push_unchecked(view) }; + slot.write(view); offset += len; } + // SAFETY: the loop initialized exactly `lens.len()` contiguous views (`spare` has at least + // `lens.len()` slots, and `zip` stops at the shorter operand). + unsafe { views.set_len(lens.len()) }; let buffers = if bytes.is_empty() { Vec::new() diff --git a/vortex-array/src/arrays/varbinview/view.rs b/vortex-array/src/arrays/varbinview/view.rs index 38cbd2798c6..2bee3b2a4df 100644 --- a/vortex-array/src/arrays/varbinview/view.rs +++ b/vortex-array/src/arrays/varbinview/view.rs @@ -170,6 +170,27 @@ impl BinaryView { Self { le_bytes: [0; 16] } } + /// Create a reference view directly from its components, without inspecting the value. + /// + /// `size` must be greater than [`MAX_INLINED_SIZE`], and `prefix` must hold the first four + /// bytes of the value. This is the fast path for bulk view construction where the caller has + /// already established that the value is too long to inline; it assembles the 16-byte view as a + /// single `u128` so the compiler can emit one wide store per view. + /// + /// [`MAX_INLINED_SIZE`]: Self::MAX_INLINED_SIZE + #[inline] + pub fn new_ref(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self { + debug_assert!(size as usize > Self::MAX_INLINED_SIZE); + // Matches the little-endian field order of `Ref` (size, prefix, buffer_index, offset), + // consistent with `le_bytes` and the `From`/`as_u128` representation. + Self::from( + u128::from(size) + | (u128::from(u32::from_le_bytes(prefix)) << 32) + | (u128::from(buffer_index) << 64) + | (u128::from(offset) << 96), + ) + } + /// Create a new inlined binary view /// /// # Panics @@ -278,3 +299,27 @@ impl fmt::Debug for BinaryView { s.finish() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_ref_matches_make_view() { + // `new_ref` assembles the reference view as a `u128`; it must be byte-identical to the + // value-inspecting `make_view` for any value longer than the inline limit. + for &len in &[13usize, 20, 255, 4096] { + let value: Vec = (0..len).map(|i| (i % 251) as u8).collect(); + let prefix = [value[0], value[1], value[2], value[3]]; + let made = BinaryView::make_view(&value, 7, 42); + let built = BinaryView::new_ref(len as u32, prefix, 7, 42); + assert_eq!(made.as_u128(), built.as_u128(), "mismatch at len {len}"); + assert!(!built.is_inlined()); + let r = built.as_view(); + assert_eq!(r.size, len as u32); + assert_eq!(r.prefix, prefix); + assert_eq!(r.buffer_index, 7); + assert_eq!(r.offset, 42); + } + } +} From 84159afd23771ddab30032b919aa57034f74c6f9 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 16:13:03 +0000 Subject: [PATCH 3/6] test: cover build_views single-buffer fast path Add tests exercising the new single-buffer fast path in build_views: a parameterized roundtrip over mixed inline/reference lengths, the 12/13 byte inline boundary, empty and interleaved-empty values, all-inlined and all-reference sets, and a single large value. Also verify the fast path is taken at the exact bytes.len() == max_buffer_len boundary, that it agrees with the slow rollover path on reconstructed values, handles empty input, and produces views byte-identical to make_view. Allow cast_possible_truncation in the view.rs test module so the new_ref test compiles under clippy. Signed-off-by: Joe Isaacs --- .../src/arrays/varbinview/build_views.rs | 155 ++++++++++++++++++ vortex-array/src/arrays/varbinview/view.rs | 4 + 2 files changed, 159 insertions(+) diff --git a/vortex-array/src/arrays/varbinview/build_views.rs b/vortex-array/src/arrays/varbinview/build_views.rs index 6ef87111610..ac540895f6a 100644 --- a/vortex-array/src/arrays/varbinview/build_views.rs +++ b/vortex-array/src/arrays/varbinview/build_views.rs @@ -104,12 +104,167 @@ pub fn build_views>( #[cfg(test)] mod tests { + use rstest::rstest; use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use crate::arrays::varbinview::BinaryView; use crate::arrays::varbinview::build_views::build_views; + /// Concatenate `values` into a single byte heap and return it alongside the per-element lengths, + /// matching the `(bytes, lens)` inputs that `build_views` consumes. + fn flatten(values: &[&[u8]]) -> (ByteBufferMut, Vec) { + let mut bytes = ByteBufferMut::empty(); + let mut lens = Vec::with_capacity(values.len()); + for v in values { + bytes.extend_from_slice(v); + lens.push(u32::try_from(v.len()).unwrap()); + } + (bytes, lens) + } + + /// Reconstruct the logical value behind each view by dereferencing it through the output + /// buffers. The first buffer corresponds to `start_buf_index`, so buffer indices are rebased by + /// that amount. This is the core correctness invariant: regardless of which code path built the + /// views, every view must point back at its original bytes. + fn reconstruct( + buffers: &[ByteBuffer], + views: &[BinaryView], + start_buf_index: u32, + ) -> Vec> { + views + .iter() + .map(|view| { + if view.is_inlined() { + view.as_inlined().value().to_vec() + } else { + let r = view.as_view(); + let buf = &buffers[(r.buffer_index - start_buf_index) as usize]; + buf[r.as_range()].to_vec() + } + }) + .collect() + } + + /// The single-buffer fast path (`bytes.len() <= max_buffer_len`) must reproduce every input + /// value exactly, emit a single output buffer holding the untouched heap, and reference only + /// `start_buf_index`. We cover a spread of value sets that mix inlined (<= 12 bytes) and + /// reference (> 12 bytes) lengths, including the 12/13 byte inline boundary, empty values, and a + /// fully-inlined set. + #[rstest] + #[case::mixed(&[b"a".as_slice(), b"this is a long reference value", b"short", b"another long value here!!"])] + #[case::inline_boundary(&[&[b'x'; 12] as &[u8], &[b'y'; 13], &[b'z'; 12], &[b'w'; 13]])] + #[case::all_inlined(&[b"".as_slice(), b"a", b"bb", b"ccc", b"dddddddddddd"])] + #[case::all_reference(&[&[b'a'; 100] as &[u8], &[b'b'; 50], &[b'c'; 4096]])] + #[case::empty_values_interleaved(&[b"".as_slice(), b"a long value that is referenced", b"", b"", b"trailing long reference value"])] + #[case::single_long(&[&[7u8; 1 << 16] as &[u8]])] + fn fast_path_roundtrip(#[case] values: &[&[u8]]) { + let (bytes, lens) = flatten(values); + let total = bytes.len(); + let start_buf_index = 3; + + // `max_buffer_len` strictly greater than the heap forces the single-buffer fast path. + let (buffers, views) = build_views(start_buf_index, total + 1, bytes, &lens); + + assert_eq!(views.len(), values.len()); + if total == 0 { + assert!(buffers.is_empty(), "empty heap must not allocate a buffer"); + } else { + assert_eq!(buffers.len(), 1, "whole heap must stay in one buffer"); + // The fast path freezes the input heap unchanged. + let concatenated: Vec = values.concat(); + assert_eq!(buffers[0].as_slice(), concatenated.as_slice()); + } + for view in views.iter() { + if !view.is_inlined() { + assert_eq!(view.as_view().buffer_index, start_buf_index); + } + } + + let expected: Vec> = values.iter().map(|v| v.to_vec()).collect(); + assert_eq!(reconstruct(&buffers, &views, start_buf_index), expected); + } + + /// The fast path is taken when `bytes.len() <= max_buffer_len`, so equality at the boundary must + /// still produce a single buffer (not roll over to the slow path). + #[test] + fn fast_path_taken_at_exact_boundary() { + let (bytes, lens) = + flatten(&[b"this value is definitely long", b"and so is this one here"]); + let total = bytes.len(); + + let (buffers, views) = build_views(0, total, bytes, &lens); + + assert_eq!( + buffers.len(), + 1, + "len == max_buffer_len must stay on fast path" + ); + assert_eq!(views.len(), 2); + } + + /// For the same logical data, the fast path (single buffer) and the slow rollover path must + /// reconstruct identical values. Driving the slow path with a small `max_buffer_len` forces + /// buffer splitting while leaving the recovered values unchanged. + #[test] + fn fast_and_slow_paths_agree() { + let values: &[&[u8]] = &[ + b"first long reference value", + b"tiny", + b"second long reference value!!", + b"third looooong reference value", + ]; + let expected: Vec> = values.iter().map(|v| v.to_vec()).collect(); + + let (fast_bytes, lens) = flatten(values); + let total = fast_bytes.len(); + let (fast_buffers, fast_views) = build_views(0, total + 1, fast_bytes, &lens); + assert_eq!(fast_buffers.len(), 1); + assert_eq!(reconstruct(&fast_buffers, &fast_views, 0), expected); + + // Force the rollover path: a small cap (>= the longest value) that the total heap exceeds. + let longest = values.iter().map(|v| v.len()).max().unwrap(); + let (slow_bytes, _) = flatten(values); + let (slow_buffers, slow_views) = build_views(0, longest, slow_bytes, &lens); + assert!( + slow_buffers.len() > 1, + "small cap should split into many buffers" + ); + assert_eq!(reconstruct(&slow_buffers, &slow_views, 0), expected); + + // Same logical contents regardless of how the heap was partitioned. + assert_eq!( + reconstruct(&fast_buffers, &fast_views, 0), + reconstruct(&slow_buffers, &slow_views, 0) + ); + } + + /// Empty input must yield no buffers and no views, exercising the `bytes.is_empty()` branch. + #[test] + fn fast_path_empty_input() { + let lens: Vec = Vec::new(); + let (buffers, views) = build_views(0, 1024, ByteBufferMut::empty(), &lens); + assert!(buffers.is_empty()); + assert!(views.is_empty()); + } + + /// The fast path must produce views byte-identical to the value-inspecting `make_view`, which is + /// what the slow path uses. This pins the inline/reference decision and field layout. + #[test] + fn fast_path_matches_make_view() { + let values: &[&[u8]] = &[b"inline", b"this is a long reference value", b""]; + let (bytes, lens) = flatten(values); + let total = bytes.len(); + let (_buffers, views) = build_views(0, total + 1, bytes, &lens); + + let expected = [ + BinaryView::make_view(b"inline", 0, 0), + BinaryView::make_view(b"this is a long reference value", 0, 6), + BinaryView::make_view(b"", 0, 36), + ]; + assert_eq!(views.as_slice(), &expected); + } + #[test] fn test_to_canonical_large() { // We are testing generating views for raw data that should look like diff --git a/vortex-array/src/arrays/varbinview/view.rs b/vortex-array/src/arrays/varbinview/view.rs index 2bee3b2a4df..dd411f07cc4 100644 --- a/vortex-array/src/arrays/varbinview/view.rs +++ b/vortex-array/src/arrays/varbinview/view.rs @@ -301,6 +301,10 @@ impl fmt::Debug for BinaryView { } #[cfg(test)] +#[expect( + clippy::cast_possible_truncation, + reason = "test values are bounded well within the target types" +)] mod tests { use super::*; From a95257e70a8136638a7b72bc9b83c830a0296b35 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 16:19:15 +0000 Subject: [PATCH 4/6] test: verify build_views fast path large u32 offsets Add fast_path_large_offsets, which builds a ~9 MiB heap so the running offset climbs past 2^23 while staying far below MAX_BUFFER_LEN. Each value encodes its index, and the test asserts every Ref records the exact cumulative offset and size, guarding against any accidental narrowing of the u32 offset/size fields written via as_ truncation. Signed-off-by: Joe Isaacs --- .../src/arrays/varbinview/build_views.rs | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/vortex-array/src/arrays/varbinview/build_views.rs b/vortex-array/src/arrays/varbinview/build_views.rs index ac540895f6a..a5ef8ac2404 100644 --- a/vortex-array/src/arrays/varbinview/build_views.rs +++ b/vortex-array/src/arrays/varbinview/build_views.rs @@ -185,6 +185,42 @@ mod tests { assert_eq!(reconstruct(&buffers, &views, start_buf_index), expected); } + /// Offsets and sizes are written into the `u32` `Ref` fields via `as_` truncation, so we must + /// confirm they stay correct once the running offset grows well past the 16-bit range (i.e. is + /// not narrowed to a smaller width). A ~9 MiB heap pushes offsets above 2^23 while remaining far + /// below `MAX_BUFFER_LEN`; each value encodes its index in its first bytes so a misplaced offset + /// would reconstruct the wrong bytes. + #[test] + fn fast_path_large_offsets() { + const N: usize = 9000; + const LEN: usize = 1000; + // The final offset is (N - 1) * LEN, which must exceed 2^23 to be a meaningful check. + const _: () = assert!((N - 1) * LEN > (1 << 23)); + + let values: Vec> = (0..N) + .map(|i| { + let mut v = vec![0u8; LEN]; + v[..4].copy_from_slice(&u32::try_from(i).unwrap().to_le_bytes()); + v + }) + .collect(); + let refs: Vec<&[u8]> = values.iter().map(|v| v.as_slice()).collect(); + + let (bytes, lens) = flatten(&refs); + let total = bytes.len(); + + let (buffers, views) = build_views(0, total + 1, bytes, &lens); + + assert_eq!(buffers.len(), 1); + // The recorded offset must equal the cumulative byte position, exactly, for every view. + for (i, view) in views.iter().enumerate() { + let r = view.as_view(); + assert_eq!(r.offset as usize, i * LEN, "wrong offset for view {i}"); + assert_eq!(r.size as usize, LEN); + } + assert_eq!(reconstruct(&buffers, &views, 0), values); + } + /// The fast path is taken when `bytes.len() <= max_buffer_len`, so equality at the boundary must /// still produce a single buffer (not roll over to the slow path). #[test] From 10f6f0140c4b755f3acf4af7a4a9fd64eadf6d8b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 16:33:46 +0000 Subject: [PATCH 5/6] test: slow gated FSST canonicalize offsets-overflow regression Add fsst_canonicalize_offsets_overflow_i32, the decode-side companion to fsst_compress_offsets_overflow_i32. It builds an FSST array whose decompressed heap crosses i32::MAX so canonicalization feeds build_views a heap larger than MAX_BUFFER_LEN, forcing the single-buffer fast path to be declined in favor of buffer rollover. The test asserts the row count is preserved, that more than one data buffer is produced, that no buffer exceeds MAX_BUFFER_LEN, and that rows straddling the rollover boundary reconstruct exactly. Gated with the same #[test_with::env(CI)] / #[test_with::no_env( VORTEX_SKIP_SLOW_TESTS)] attributes as the existing slow test. Highly compressible bodies keep the FSST array tiny so peak memory is dominated by the input and decompressed heaps. Signed-off-by: Joe Isaacs --- encodings/fsst/src/tests.rs | 90 +++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/encodings/fsst/src/tests.rs b/encodings/fsst/src/tests.rs index c4dca57c61b..dbaec0fee4b 100644 --- a/encodings/fsst/src/tests.rs +++ b/encodings/fsst/src/tests.rs @@ -10,6 +10,7 @@ use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::varbin::builder::VarBinBuilder; +use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; use vortex_array::assert_arrays_eq; use vortex_array::assert_nth_scalar; use vortex_array::dtype::DType; @@ -168,3 +169,92 @@ fn fsst_compress_offsets_overflow_i32() { let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx); assert_eq!(compressed.len(), len); } + +// TODO(someone): ideally CI would run this in release mode as well since debug builds make the +// allocation and decompression loop substantially slower. +/// Decode-side companion to [`fsst_compress_offsets_overflow_i32`]. Canonicalizing an FSST array +/// decompresses the whole heap and feeds it to `build_views` with `max_buffer_len = MAX_BUFFER_LEN` +/// (`i32::MAX`). `build_views` has a single-buffer fast path taken only when the decoded heap fits +/// within one buffer; when the heap exceeds `MAX_BUFFER_LEN` it must instead roll over into multiple +/// buffers (resetting the per-buffer offset) so the `u32` view offsets never overflow. +/// +/// This builds an FSST array whose *decompressed* size crosses `i32::MAX`, so canonicalization is +/// forced down the rollover path. We assert the row count is preserved, that more than one data +/// buffer was produced (proving the fast path was declined), that no single buffer exceeds +/// `MAX_BUFFER_LEN`, and that values on both sides of the rollover boundary reconstruct exactly. +/// +/// Highly compressible bodies keep the FSST array tiny, so peak memory is dominated by the ~2.25 GiB +/// input and the ~2.25 GiB decompressed heap. Gated to CI and skipped when `VORTEX_SKIP_SLOW_TESTS` +/// is set. To run it locally: +/// +/// ```text +/// CI=1 cargo test --release -p vortex-fsst fsst_canonicalize_offsets +/// ``` +#[test_with::env(CI)] +#[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] +fn fsst_canonicalize_offsets_overflow_i32() { + const STRING_LEN: usize = 64 * 1024; + // Comfortably past MAX_BUFFER_LEN (`i32::MAX` ~= 2.0 GiB) so the decoded heap must roll over. + const TOTAL_BYTES: usize = (1usize << 31) + (256 << 20); // ~2.25 GiB + const N: usize = TOTAL_BYTES / STRING_LEN; + + // Each value is a long, trivially compressible body carrying its row index as an ASCII prefix, + // so FSST compresses the heap to almost nothing while every row stays individually verifiable. + fn nth_string(i: usize) -> Vec { + let mut s = vec![b'x'; STRING_LEN]; + let prefix = format!("row-{i:08}-"); + s[..prefix.len()].copy_from_slice(prefix.as_bytes()); + s + } + + println!("building large VarBinArray"); + let mut builder = VarBinBuilder::::with_capacity(N); + let mut buf = vec![b'x'; STRING_LEN]; + for i in 0..N { + let prefix = format!("row-{i:08}-"); + buf[..prefix.len()].copy_from_slice(prefix.as_bytes()); + builder.append_value(&buf); + } + let array = builder.finish(DType::Utf8(Nullability::NonNullable)); + + println!("training FSST compressor"); + let compressor = fsst_train_compressor(&array); + let len = array.len(); + let dtype = array.dtype().clone(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + println!("compressing to FSST"); + let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx).into_array(); + + println!("canonicalizing to VarBinView"); + let canonical = compressed.execute::(&mut ctx).unwrap(); + + assert_eq!( + canonical.len(), + N, + "row count must survive canonicalization" + ); + assert!( + canonical.data_buffers().len() >= 2, + "decoded heap exceeding MAX_BUFFER_LEN must roll over into multiple buffers, got {}", + canonical.data_buffers().len() + ); + for (i, b) in canonical.data_buffers().iter().enumerate() { + assert!( + b.as_host().len() <= MAX_BUFFER_LEN, + "buffer {i} of {} bytes exceeds MAX_BUFFER_LEN", + b.as_host().len() + ); + } + + // Spot-check the endpoints and the rows straddling the rollover boundary, which is the first + // place the second buffer's offsets restart from zero. + let boundary = MAX_BUFFER_LEN / STRING_LEN; + for i in [0, boundary - 1, boundary, boundary + 1, N / 2, N - 1] { + assert_eq!( + canonical.bytes_at(i).as_slice(), + nth_string(i).as_slice(), + "value mismatch at row {i}" + ); + } +} From fc6b83999dd626f6fb0dd67a1174d2863c37a4eb Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 16:46:40 +0000 Subject: [PATCH 6/6] test: replace FSST overflow test with direct build_views rollover test Drop the FSST-based fsst_canonicalize_offsets_overflow_i32 test in favor of a direct build_views test in vortex-array, avoiding the compressor/training and the doubled input+decompressed heaps. build_views_offsets_overflow_i32 builds a ~2.25 GiB heap (just past i32::MAX) and calls build_views with MAX_BUFFER_LEN, asserting the heap rolls over into multiple buffers, that no buffer exceeds MAX_BUFFER_LEN, and that rows straddling the rollover boundary reconstruct exactly. Validated that the test catches the overflow: with the single-buffer fast-path guard intact it passes, and forcing the fast path to swallow the whole heap makes it fail on the multi-buffer assertion. Gated with #[test_with::env(CI)] / #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)], matching the existing slow-test convention; adds test-with as a vortex-array dev-dependency. Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + encodings/fsst/src/tests.rs | 90 ------------------- vortex-array/Cargo.toml | 1 + .../src/arrays/varbinview/build_views.rs | 72 +++++++++++++++ 4 files changed, 74 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 045c72176fd..64afffd997d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9195,6 +9195,7 @@ dependencies = [ "static_assertions", "tabled", "termtree", + "test-with", "tracing", "uuid", "vortex-array", diff --git a/encodings/fsst/src/tests.rs b/encodings/fsst/src/tests.rs index dbaec0fee4b..c4dca57c61b 100644 --- a/encodings/fsst/src/tests.rs +++ b/encodings/fsst/src/tests.rs @@ -10,7 +10,6 @@ use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::varbin::builder::VarBinBuilder; -use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; use vortex_array::assert_arrays_eq; use vortex_array::assert_nth_scalar; use vortex_array::dtype::DType; @@ -169,92 +168,3 @@ fn fsst_compress_offsets_overflow_i32() { let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx); assert_eq!(compressed.len(), len); } - -// TODO(someone): ideally CI would run this in release mode as well since debug builds make the -// allocation and decompression loop substantially slower. -/// Decode-side companion to [`fsst_compress_offsets_overflow_i32`]. Canonicalizing an FSST array -/// decompresses the whole heap and feeds it to `build_views` with `max_buffer_len = MAX_BUFFER_LEN` -/// (`i32::MAX`). `build_views` has a single-buffer fast path taken only when the decoded heap fits -/// within one buffer; when the heap exceeds `MAX_BUFFER_LEN` it must instead roll over into multiple -/// buffers (resetting the per-buffer offset) so the `u32` view offsets never overflow. -/// -/// This builds an FSST array whose *decompressed* size crosses `i32::MAX`, so canonicalization is -/// forced down the rollover path. We assert the row count is preserved, that more than one data -/// buffer was produced (proving the fast path was declined), that no single buffer exceeds -/// `MAX_BUFFER_LEN`, and that values on both sides of the rollover boundary reconstruct exactly. -/// -/// Highly compressible bodies keep the FSST array tiny, so peak memory is dominated by the ~2.25 GiB -/// input and the ~2.25 GiB decompressed heap. Gated to CI and skipped when `VORTEX_SKIP_SLOW_TESTS` -/// is set. To run it locally: -/// -/// ```text -/// CI=1 cargo test --release -p vortex-fsst fsst_canonicalize_offsets -/// ``` -#[test_with::env(CI)] -#[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] -fn fsst_canonicalize_offsets_overflow_i32() { - const STRING_LEN: usize = 64 * 1024; - // Comfortably past MAX_BUFFER_LEN (`i32::MAX` ~= 2.0 GiB) so the decoded heap must roll over. - const TOTAL_BYTES: usize = (1usize << 31) + (256 << 20); // ~2.25 GiB - const N: usize = TOTAL_BYTES / STRING_LEN; - - // Each value is a long, trivially compressible body carrying its row index as an ASCII prefix, - // so FSST compresses the heap to almost nothing while every row stays individually verifiable. - fn nth_string(i: usize) -> Vec { - let mut s = vec![b'x'; STRING_LEN]; - let prefix = format!("row-{i:08}-"); - s[..prefix.len()].copy_from_slice(prefix.as_bytes()); - s - } - - println!("building large VarBinArray"); - let mut builder = VarBinBuilder::::with_capacity(N); - let mut buf = vec![b'x'; STRING_LEN]; - for i in 0..N { - let prefix = format!("row-{i:08}-"); - buf[..prefix.len()].copy_from_slice(prefix.as_bytes()); - builder.append_value(&buf); - } - let array = builder.finish(DType::Utf8(Nullability::NonNullable)); - - println!("training FSST compressor"); - let compressor = fsst_train_compressor(&array); - let len = array.len(); - let dtype = array.dtype().clone(); - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - - println!("compressing to FSST"); - let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx).into_array(); - - println!("canonicalizing to VarBinView"); - let canonical = compressed.execute::(&mut ctx).unwrap(); - - assert_eq!( - canonical.len(), - N, - "row count must survive canonicalization" - ); - assert!( - canonical.data_buffers().len() >= 2, - "decoded heap exceeding MAX_BUFFER_LEN must roll over into multiple buffers, got {}", - canonical.data_buffers().len() - ); - for (i, b) in canonical.data_buffers().iter().enumerate() { - assert!( - b.as_host().len() <= MAX_BUFFER_LEN, - "buffer {i} of {} bytes exceeds MAX_BUFFER_LEN", - b.as_host().len() - ); - } - - // Spot-check the endpoints and the rows straddling the rollover boundary, which is the first - // place the second buffer's offsets restart from zero. - let boundary = MAX_BUFFER_LEN / STRING_LEN; - for i in [0, boundary - 1, boundary, boundary + 1, N / 2, N - 1] { - assert_eq!( - canonical.bytes_at(i).as_slice(), - nth_string(i).as_slice(), - "value mismatch at row {i}" - ); - } -} diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 666a23c02c4..76e119ae366 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -95,6 +95,7 @@ rand_distr = { workspace = true } rstest = { workspace = true } serde_json = { workspace = true } serde_test = { workspace = true } +test-with = { workspace = true } vortex-array = { path = ".", features = ["_test-harness", "table-display"] } [[bench]] diff --git a/vortex-array/src/arrays/varbinview/build_views.rs b/vortex-array/src/arrays/varbinview/build_views.rs index a5ef8ac2404..a13cd62bf78 100644 --- a/vortex-array/src/arrays/varbinview/build_views.rs +++ b/vortex-array/src/arrays/varbinview/build_views.rs @@ -109,6 +109,7 @@ mod tests { use vortex_buffer::ByteBufferMut; use crate::arrays::varbinview::BinaryView; + use crate::arrays::varbinview::build_views::MAX_BUFFER_LEN; use crate::arrays::varbinview::build_views::build_views; /// Concatenate `values` into a single byte heap and return it alongside the per-element lengths, @@ -301,6 +302,77 @@ mod tests { assert_eq!(views.as_slice(), &expected); } + // TODO(someone): ideally CI would run this in release mode as well, since debug builds make the + // ~2.25 GiB allocation and fill loop substantially slower. + /// Slow regression for the single-buffer fast-path guard. The fast path is only valid when the + /// whole heap fits in one buffer (`bytes.len() <= max_buffer_len`); once the heap exceeds + /// [`MAX_BUFFER_LEN`] (`i32::MAX`, ~2.0 GiB) `build_views` must roll the heap into multiple + /// buffers, resetting the per-buffer offset, so no view references an offset past the + /// `i32`-bounded buffer limit. + /// + /// We build a heap just past `i32::MAX` and assert it rolls over into more than one buffer, that + /// no buffer exceeds `MAX_BUFFER_LEN`, and that values straddling the rollover boundary (where + /// the second buffer's offsets restart from zero) reconstruct exactly. If the guard regressed and + /// the fast path swallowed the whole heap, it would emit a single >2 GiB buffer with offsets past + /// `i32::MAX`, which the buffer-count and buffer-size assertions catch. + /// + /// Allocates ~2.25 GiB, so it is gated to CI and skipped when `VORTEX_SKIP_SLOW_TESTS` is set: + /// + /// ```text + /// CI=1 cargo test --release -p vortex-array build_views_offsets_overflow + /// ``` + /// + /// [`MAX_BUFFER_LEN`]: super::MAX_BUFFER_LEN + #[test_with::env(CI)] + #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] + fn build_views_offsets_overflow_i32() { + const STRING_LEN: usize = 64 * 1024; + // Comfortably past MAX_BUFFER_LEN (`i32::MAX` ~= 2.0 GiB) so the heap must roll over. + const TOTAL_BYTES: usize = (1usize << 31) + (256 << 20); // ~2.25 GiB + const N: usize = TOTAL_BYTES / STRING_LEN; + + // Each value's first 8 bytes encode its row index, so a misrouted offset is detectable. + let nth_string = |i: usize| { + let mut s = vec![b'x'; STRING_LEN]; + s[..8].copy_from_slice(&(i as u64).to_le_bytes()); + s + }; + + let mut bytes = ByteBufferMut::with_capacity(N * STRING_LEN); + let mut value = vec![b'x'; STRING_LEN]; + for i in 0..N { + value[..8].copy_from_slice(&(i as u64).to_le_bytes()); + bytes.extend_from_slice(&value); + } + + let lens = vec![u32::try_from(STRING_LEN).unwrap(); N]; + let (buffers, views) = build_views(0, MAX_BUFFER_LEN, bytes, &lens); + + assert_eq!(views.len(), N); + assert!( + buffers.len() >= 2, + "heap exceeding MAX_BUFFER_LEN must roll over into multiple buffers, got {}", + buffers.len() + ); + for (i, b) in buffers.iter().enumerate() { + assert!( + b.len() <= MAX_BUFFER_LEN, + "buffer {i} of {} bytes exceeds MAX_BUFFER_LEN", + b.len() + ); + } + + // The boundary row is the first whose offset would cross MAX_BUFFER_LEN on the fast path. + let boundary = MAX_BUFFER_LEN / STRING_LEN; + for i in [0, boundary - 1, boundary, boundary + 1, N / 2, N - 1] { + let view = &views[i]; + let r = view.as_view(); + let got = &buffers[r.buffer_index as usize][r.as_range()]; + assert_eq!(got, nth_string(i).as_slice(), "value mismatch at row {i}"); + assert_eq!(r.size as usize, STRING_LEN); + } + } + #[test] fn test_to_canonical_large() { // We are testing generating views for raw data that should look like