From 94be61da468b55b476ac11eb3282cdc6a6eff59a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 08:38:50 +0000 Subject: [PATCH 01/27] Add experimental OnPair string encoding backed by the onpair crate Adds the vortex-onpair array encoding under encodings/experimental/onpair, sourcing the compression algorithm from the standalone `onpair` crate (local path dependency) rather than vendored code. - vortex-onpair: Vortex array wrapping, serialisation, and cast/filter pushdown only; train/encode/decode live in the onpair crate. - btrblocks: register OnPairScheme alongside FSSTScheme so the sample-based selector keeps the smaller per column; delta-encode the monotonic dict_offsets/codes_offsets children (>= 2048 rows) when it wins. - vortex-file: register the OnPair encoding and allow it in the write strategy. Note: onpair is a local path dependency for now (to be published to crates.io). Co-Authored-By: Claude Opus 4.7 Signed-off-by: Joe Isaacs --- Cargo.lock | 26 + Cargo.toml | 4 + encodings/experimental/onpair/Cargo.toml | 39 ++ encodings/experimental/onpair/README.md | 39 ++ .../experimental/onpair/benches/decode.rs | 208 +++++++ .../onpair/goldenfiles/onpair.metadata | 1 + encodings/experimental/onpair/public-api.lock | 263 ++++++++ encodings/experimental/onpair/src/array.rs | 565 ++++++++++++++++++ .../experimental/onpair/src/canonical.rs | 85 +++ encodings/experimental/onpair/src/compress.rs | 165 +++++ .../experimental/onpair/src/compute/cast.rs | 55 ++ .../experimental/onpair/src/compute/filter.rs | 117 ++++ .../experimental/onpair/src/compute/mod.rs | 5 + encodings/experimental/onpair/src/decode.rs | 347 +++++++++++ encodings/experimental/onpair/src/kernel.rs | 16 + encodings/experimental/onpair/src/lib.rs | 33 + encodings/experimental/onpair/src/ops.rs | 27 + encodings/experimental/onpair/src/rules.rs | 13 + encodings/experimental/onpair/src/slice.rs | 42 ++ encodings/experimental/onpair/src/tests.rs | 387 ++++++++++++ 
.../experimental/onpair/tests/big_data.rs | 163 +++++ vortex-btrblocks/Cargo.toml | 3 +- vortex-btrblocks/src/builder.rs | 16 +- vortex-btrblocks/src/schemes/integer.rs | 2 +- vortex-btrblocks/src/schemes/string.rs | 242 +++++++- vortex-btrblocks/tests/onpair_roundtrip.rs | 153 +++++ vortex-file/Cargo.toml | 2 + vortex-file/src/lib.rs | 4 + vortex-file/src/strategy.rs | 4 + .../tests/test_onpair_string_roundtrip.rs | 404 +++++++++++++ 30 files changed, 3424 insertions(+), 6 deletions(-) create mode 100644 encodings/experimental/onpair/Cargo.toml create mode 100644 encodings/experimental/onpair/README.md create mode 100644 encodings/experimental/onpair/benches/decode.rs create mode 100644 encodings/experimental/onpair/goldenfiles/onpair.metadata create mode 100644 encodings/experimental/onpair/public-api.lock create mode 100644 encodings/experimental/onpair/src/array.rs create mode 100644 encodings/experimental/onpair/src/canonical.rs create mode 100644 encodings/experimental/onpair/src/compress.rs create mode 100644 encodings/experimental/onpair/src/compute/cast.rs create mode 100644 encodings/experimental/onpair/src/compute/filter.rs create mode 100644 encodings/experimental/onpair/src/compute/mod.rs create mode 100644 encodings/experimental/onpair/src/decode.rs create mode 100644 encodings/experimental/onpair/src/kernel.rs create mode 100644 encodings/experimental/onpair/src/lib.rs create mode 100644 encodings/experimental/onpair/src/ops.rs create mode 100644 encodings/experimental/onpair/src/rules.rs create mode 100644 encodings/experimental/onpair/src/slice.rs create mode 100644 encodings/experimental/onpair/src/tests.rs create mode 100644 encodings/experimental/onpair/tests/big_data.rs create mode 100644 vortex-btrblocks/tests/onpair_roundtrip.rs create mode 100644 vortex-file/tests/test_onpair_string_roundtrip.rs diff --git a/Cargo.lock b/Cargo.lock index 045c72176fd..c74f1a876de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5723,6 +5723,14 @@ version = 
"0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" +[[package]] +name = "onpair" +version = "0.0.1" +dependencies = [ + "hashbrown 0.16.1", + "rand 0.9.4", +] + [[package]] name = "oorandom" version = "11.1.5" @@ -9340,6 +9348,7 @@ dependencies = [ "vortex-fastlanes", "vortex-fsst", "vortex-mask", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-sequence", @@ -9682,6 +9691,7 @@ dependencies = [ "vortex-layout", "vortex-mask", "vortex-metrics", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-scan", @@ -9889,6 +9899,22 @@ dependencies = [ "vortex-cuda-macros", ] +[[package]] +name = "vortex-onpair" +version = "0.1.0" +dependencies = [ + "codspeed-divan-compat", + "memchr", + "onpair", + "prost 0.14.3", + "rstest", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-session", +] + [[package]] name = "vortex-parquet-variant" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index af46038fd58..a696d96d8ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,8 @@ members = [ "encodings/zstd", "encodings/bytebool", "encodings/parquet-variant", + # Experimental encodings + "encodings/experimental/onpair", # Benchmarks "benchmarks/lance-bench", "benchmarks/compress-bench", @@ -188,6 +190,7 @@ num_enum = { version = "0.7.3", default-features = false } object_store = { version = "0.13.1", default-features = false } once_cell = "1.21" oneshot = { version = "0.2.0", features = ["async"] } +onpair = { path = "../onpair" } opentelemetry = "0.32.0" opentelemetry-otlp = "0.32.0" opentelemetry_sdk = "0.32.0" @@ -288,6 +291,7 @@ vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = fals vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false } vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = false } vortex-metrics = { version = "0.1.0", path = 
"./vortex-metrics", default-features = false } +vortex-onpair = { version = "0.1.0", path = "./encodings/experimental/onpair", default-features = false } vortex-parquet-variant = { version = "0.1.0", path = "./encodings/parquet-variant" } vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false } vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false } diff --git a/encodings/experimental/onpair/Cargo.toml b/encodings/experimental/onpair/Cargo.toml new file mode 100644 index 00000000000..ba8c478570b --- /dev/null +++ b/encodings/experimental/onpair/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "vortex-onpair" +authors = { workspace = true } +categories = { workspace = true } +description = "Vortex OnPair string array encoding (dict-12, pushdown predicates)" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +memchr = { workspace = true } +onpair = { workspace = true } +prost = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } +vortex-mask = { workspace = true } +vortex-session = { workspace = true } + +[features] +_test-harness = ["vortex-array/_test-harness"] + +[dev-dependencies] +divan = { workspace = true } +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[[bench]] +name = "decode" +harness = false diff --git a/encodings/experimental/onpair/README.md b/encodings/experimental/onpair/README.md new file mode 100644 index 00000000000..9628c006201 --- /dev/null +++ b/encodings/experimental/onpair/README.md @@ -0,0 +1,39 @@ +# Vortex OnPair + +A Vortex Encoding for Binary and Utf8 data that uses 
the +[OnPair][onpair] short-string compression algorithm. OnPair is a +dictionary-based encoder with fast per-row random access. + +The trainer / encoder lives in the standalone [`onpair`][onpair-crate] +crate; this crate wraps the resulting column as a Vortex array with +cascading-compressor support on every integer child. + +## Compute + +Like the FSST encoding, this crate provides `cast` and `filter` +pushdown. Other operators fall back to ordinary decompression. + +## Default Configuration + +The default training preset is **dict-12**: 12 bits per token, +dictionary capped at 4 096 entries. Token codes are stored as a +`PrimitiveArray`; downstream `FastLanes::BitPacking` losslessly +narrows the child to exactly `bits`-bit codes on disk. + +## Layout + +- Buffer 0 — `dict_bytes`: dictionary blob built by the OnPair trainer, + padded with `MAX_TOKEN_SIZE` trailing zero bytes so the over-copy + decoder can read 16 bytes past the last token. +- Slot 0 — `dict_offsets`: `PrimitiveArray`, len `dict_size + 1`. +- Slot 1 — `codes`: `PrimitiveArray`, length `total_tokens`. +- Slot 2 — `codes_offsets`: `PrimitiveArray`, length `num_rows + 1`. +- Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, length + `num_rows`. +- Slot 4 — optional validity child. + +All four integer slot children flow through the standard cascading +compressor pipeline (FoR / BitPacking / RunEnd / etc.). + +[onpair]: https://arxiv.org/abs/2508.02280 +[onpair-crate]: https://github.com/spiraldb/onpair diff --git a/encodings/experimental/onpair/benches/decode.rs b/encodings/experimental/onpair/benches/decode.rs new file mode 100644 index 00000000000..52a4ea77d87 --- /dev/null +++ b/encodings/experimental/onpair/benches/decode.rs @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Decode-path microbenchmarks for the OnPair Vortex array. +//! +//! 
* `decode_rows_unchecked` — the production decoder hot loop (combined +//! `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled). +//! Measured by hand-driving `DecodeView::decode_rows_unchecked` straight +//! into a `Vec` so the time reflects the inner loop only. +//! * `canonicalize_to_varbinview` — the full Vortex +//! `OnPair → VarBinViewArray` path callers actually hit. Includes +//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc. +//! +//! Each bench sweeps four corpus shapes against two row counts to surface +//! cache-pressure cliffs and per-row decode cost. + +#![allow( + clippy::cast_possible_truncation, + clippy::cast_lossless, + clippy::panic, + clippy::tests_outside_test_module, + clippy::redundant_clone, + clippy::missing_safety_doc, + clippy::unwrap_used, + clippy::expect_used +)] + +use std::sync::LazyLock; + +use divan::Bencher; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::session::ArraySession; +use vortex_mask::Mask; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::MAX_TOKEN_SIZE; +use vortex_onpair::OnPair; +use vortex_onpair::OnPairArray; +use vortex_onpair::decode::OwnedDecodeInputs; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +#[derive(Copy, Clone, Debug)] +enum Shape { + /// URL / HTTP-log shaped — high lexical overlap, ~35–45 bytes per row. + UrlLog, + /// Short uniform strings — 4–8 bytes per row, very low cardinality. + Short, + /// Long log-line shaped — ~120 bytes per row, more tokens per row. + Long, + /// High cardinality — every row unique. 
+ HighCard, +} + +fn corpus(n: usize, shape: Shape) -> Vec { + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + let mut next = || { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + state + }; + let mut out = Vec::with_capacity(n); + match shape { + Shape::UrlLog => { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::Short => { + let templates: &[&str] = &["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"]; + for _ in 0..n { + let s = next(); + out.push(templates[(s as usize) % templates.len()].to_string()); + } + } + Shape::Long => { + let templates: &[&str] = &[ + "2026-05-14T12:34:56.789012Z INFO request_id={id} method=GET path=/api/v1/users/{id}/profile status=200", + "2026-05-14T12:34:56.789012Z WARN request_id={id} method=POST path=/api/v1/users/{id}/sessions status=429", + "2026-05-14T12:34:56.789012Z ERROR request_id={id} method=PUT path=/api/v1/users/{id}/settings status=500", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::HighCard => { + for i in 0..n { + out.push(format!("row-{i:010x}-{rand:016x}", rand = next())); + } + } + } + out +} + +fn compress(n: usize, shape: Shape) -> OnPairArray { + let strings = corpus(n, shape); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + onpair_compress(&varbin, 
varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .unwrap_or_else(|e| panic!("onpair_compress failed: {e}")) +} + +fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) { + let mut ctx = SESSION.create_execution_ctx(); + let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) + .unwrap_or_else(|e| panic!("collect: {e}")); + let n = arr.len(); + let total: usize = inputs + .codes + .as_slice() + .iter() + .map(|&c| (inputs.dict_table.as_slice()[c as usize] & 0xffff) as usize) + .sum(); + (inputs, n, total) +} + +const CASES: &[(Shape, usize)] = &[ + (Shape::UrlLog, 100_000), + (Shape::UrlLog, 1_000_000), + (Shape::Short, 100_000), + (Shape::Long, 100_000), + (Shape::HighCard, 100_000), +]; + +/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the +/// output allocation. Hits `DecodeView::decode_rows_unchecked` directly. +#[divan::bench(args = CASES)] +fn decode_rows_unchecked(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let (inputs, n_rows, total) = materialise(&arr); + bencher.bench_local(|| { + let mut out: Vec = Vec::with_capacity(total + MAX_TOKEN_SIZE); + let dv = inputs.view(); + unsafe { + let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr()); + out.set_len(written); + } + divan::black_box(out); + }); +} + +/// Full Vortex canonicalisation, including `execute<>` on every child, +/// building the view buffer + `BinaryView` list, etc. 
+#[divan::bench(args = CASES)] +fn canonicalize_to_varbinview(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher + .with_inputs(|| arr.clone().into_array()) + .bench_local_values(|arr| { + let mut ctx = SESSION.create_execution_ctx(); + divan::black_box( + arr.execute::(&mut ctx) + .unwrap_or_else(|e| panic!("canonicalize failed: {e}")), + ) + }); +} + +// ─── Compute kernels ───────────────────────────────────────────────────── + +const COMPUTE_CASES: &[(Shape, usize)] = &[(Shape::UrlLog, 100_000), (Shape::UrlLog, 1_000_000)]; + +/// Filter — share-dict path. Builds a 1-in-7 mask so we keep ~14 % of +/// rows; the cost is dominated by the `codes` segment copy + offsets. +#[divan::bench(args = COMPUTE_CASES)] +fn filter_share_dict(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let mask = Mask::from_iter((0..n).map(|i| i % 7 == 0)); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::filter(arr.as_view(), &mask, &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +fn main() { + divan::main(); +} diff --git a/encodings/experimental/onpair/goldenfiles/onpair.metadata b/encodings/experimental/onpair/goldenfiles/onpair.metadata new file mode 100644 index 00000000000..e96baf1a0ab --- /dev/null +++ b/encodings/experimental/onpair/goldenfiles/onpair.metadata @@ -0,0 +1 @@ + € €è(08 \ No newline at end of file diff --git a/encodings/experimental/onpair/public-api.lock b/encodings/experimental/onpair/public-api.lock new file mode 100644 index 00000000000..bf1694761b3 --- /dev/null +++ b/encodings/experimental/onpair/public-api.lock @@ -0,0 +1,263 @@ +pub mod vortex_onpair + +pub mod vortex_onpair::decode + +pub struct vortex_onpair::decode::DecodeView<'a> + +pub vortex_onpair::decode::DecodeView::codes: &'a [u16] + +pub vortex_onpair::decode::DecodeView::codes_offsets: &'a [u32] + +pub 
vortex_onpair::decode::DecodeView::dict_bytes: &'a [u8] + +pub vortex_onpair::decode::DecodeView::dict_table: &'a [u64] + +impl<'a> vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_row_into(&self, usize, &mut alloc::vec::Vec) + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into(&self, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into_with_size(&self, usize, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_unchecked(&self, usize, usize, *mut u8) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len(&self, usize) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len_rows(&self, usize, usize) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::for_each_dict_slice bool>(&self, usize, F) -> bool + +impl<'a> core::clone::Clone for vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::clone(&self) -> vortex_onpair::decode::DecodeView<'a> + +impl<'a> core::marker::Copy for vortex_onpair::decode::DecodeView<'a> + +pub struct vortex_onpair::decode::OwnedDecodeInputs + +pub vortex_onpair::decode::OwnedDecodeInputs::codes: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::codes_offsets: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_bytes: vortex_buffer::ByteBuffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_table: vortex_buffer::buffer::Buffer + +impl vortex_onpair::decode::OwnedDecodeInputs + +pub fn vortex_onpair::decode::OwnedDecodeInputs::collect(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::decode::OwnedDecodeInputs::view(&self) -> vortex_onpair::decode::DecodeView<'_> + +pub struct vortex_onpair::OnPair + +impl vortex_onpair::OnPair + +pub fn 
vortex_onpair::OnPair::try_new(vortex_array::dtype::DType, vortex_array::buffer::BufferHandle, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::validity::Validity, u32) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::clone(&self) -> vortex_onpair::OnPair + +impl core::fmt::Debug for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::array::vtable::VTable for vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::OperationsVTable = vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::TypedArrayData = vortex_onpair::OnPairData + +pub type vortex_onpair::OnPair::ValidityVTable = vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::append_to_builder(vortex_array::array::view::ArrayView<'_, Self>, &mut dyn vortex_array::builders::ArrayBuilder, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + +pub fn vortex_onpair::OnPair::buffer(vortex_array::array::view::ArrayView<'_, Self>, usize) -> vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPair::buffer_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> core::option::Option + +pub fn vortex_onpair::OnPair::deserialize(&self, &vortex_array::dtype::DType, usize, &[u8], &[vortex_array::buffer::BufferHandle], &dyn vortex_array::serde::ArrayChildren, &vortex_session::VortexSession) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::execute(vortex_array::array::typed::Array, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::OnPair::execute_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub fn 
vortex_onpair::OnPair::id(&self) -> vortex_array::array::ArrayId + +pub fn vortex_onpair::OnPair::nbuffers(vortex_array::array::view::ArrayView<'_, Self>) -> usize + +pub fn vortex_onpair::OnPair::reduce_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::serialize(vortex_array::array::view::ArrayView<'_, Self>, &vortex_session::VortexSession) -> vortex_error::VortexResult>> + +pub fn vortex_onpair::OnPair::slot_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> alloc::string::String + +pub fn vortex_onpair::OnPair::validate(&self, &Self::TypedArrayData, &vortex_array::dtype::DType, usize, &[core::option::Option]) -> vortex_error::VortexResult<()> + +impl vortex_array::array::vtable::operations::OperationsVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::scalar_at(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +impl vortex_array::array::vtable::validity::ValidityVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::validity(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>) -> vortex_error::VortexResult + +impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::filter(vortex_array::array::view::ArrayView<'_, Self>, &vortex_mask::Mask, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::arrays::slice::SliceReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, 
vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::like(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::like::LikeOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub struct vortex_onpair::OnPairData + +impl vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::bits(&self) -> u32 + +pub fn vortex_onpair::OnPairData::dict_bytes(&self) -> &vortex_buffer::ByteBuffer + +pub fn vortex_onpair::OnPairData::dict_bytes_handle(&self) -> &vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPairData::is_empty(&self) -> bool + +pub fn vortex_onpair::OnPairData::len(&self) -> usize + +pub fn vortex_onpair::OnPairData::new(vortex_array::buffer::BufferHandle, u32, usize) -> Self + +impl core::clone::Clone for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::clone(&self) -> vortex_onpair::OnPairData + +impl core::fmt::Debug for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::hash::ArrayEq 
for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_eq(&self, &Self, vortex_array::hash::Precision) -> bool + +impl vortex_array::hash::ArrayHash for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_hash(&self, &mut H, vortex_array::hash::Precision) + +pub struct vortex_onpair::OnPairMetadata + +pub vortex_onpair::OnPairMetadata::bits: u32 + +pub vortex_onpair::OnPairMetadata::codes_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::codes_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_size: u64 + +pub vortex_onpair::OnPairMetadata::total_tokens: u64 + +pub vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype: i32 + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::codes_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::codes_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::dict_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::set_codes_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_codes_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_dict_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_uncompressed_lengths_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype(&self) -> vortex_array::dtype::ptype::PType + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::get_uncompressed_lengths_ptype(&self) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clone(&self) -> vortex_onpair::OnPairMetadata + +impl core::default::Default for vortex_onpair::OnPairMetadata 
+ +pub fn vortex_onpair::OnPairMetadata::default() -> Self + +impl core::fmt::Debug for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl prost::message::Message for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clear(&mut self) + +pub fn vortex_onpair::OnPairMetadata::encoded_len(&self) -> usize + +pub const vortex_onpair::DEFAULT_BITS: u32 + +pub const vortex_onpair::DEFAULT_DICT12_CONFIG: onpair::Config + +pub const vortex_onpair::MAX_TOKEN_SIZE: usize + +pub trait vortex_onpair::OnPairArrayExt: vortex_array::array::typed::TypedArrayRef + +pub fn vortex_onpair::OnPairArrayExt::array_validity(&self) -> vortex_array::validity::Validity + +pub fn vortex_onpair::OnPairArrayExt::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +impl> vortex_onpair::OnPairArrayExt for T + +pub fn T::array_validity(&self) -> vortex_array::validity::Validity + +pub fn T::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::config_with_bits(u32) -> onpair::Config + +pub fn vortex_onpair::onpair_compress>(A, usize, &vortex_array::dtype::DType, onpair::Config) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array(&vortex_array::array::erased::ArrayRef, onpair::Config, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn 
vortex_onpair::onpair_compress_array_default(&vortex_array::array::erased::ArrayRef, onpair::Config) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_iter<'a, I>(I, usize, vortex_array::dtype::DType, onpair::Config) -> vortex_error::VortexResult where I: core::iter::traits::iterator::Iterator> + +pub type vortex_onpair::OnPairArray = vortex_array::array::typed::Array diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs new file mode 100644 index 00000000000..f39656cec58 --- /dev/null +++ b/encodings/experimental/onpair/src/array.rs @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; +use std::hash::Hasher; + +use prost::Message as _; +use vortex_array::Array; +use vortex_array::ArrayEq; +use vortex_array::ArrayHash; +use vortex_array::ArrayId; +use vortex_array::ArrayParts; +use vortex_array::ArrayRef; +use vortex_array::ArraySlots; +use vortex_array::ArrayView; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::ExecutionResult; +use vortex_array::IntoArray; +use vortex_array::Precision; +use vortex_array::TypedArrayRef; +use vortex_array::buffer::BufferHandle; +use vortex_array::builders::ArrayBuilder; +use vortex_array::builders::VarBinViewBuilder; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::serde::ArrayChildren; +use vortex_array::smallvec::smallvec; +use vortex_array::validity::Validity; +use vortex_array::vtable::VTable; +use vortex_array::vtable::ValidityVTable; +use vortex_array::vtable::child_to_validity; +use vortex_array::vtable::validity_to_child; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use 
vortex_error::vortex_panic; +use vortex_session::VortexSession; +use vortex_session::registry::CachedId; + +use crate::canonical::canonicalize_onpair; +use crate::canonical::onpair_decode_views; +use crate::kernel::PARENT_KERNELS; +use crate::rules::RULES; + +/// An [`OnPair`]-encoded Vortex array. +pub type OnPairArray = Array; + +/// Default bits-per-token preset used by [`crate::onpair_compress`]: 12-bit +/// codes, dictionary capped at 4 096 entries. +pub const DEFAULT_BITS: u32 = 12; + +/// Wire-format metadata persisted alongside the OnPair buffer + slot children. +/// +/// On disk the layout is FSST-shape: +/// +/// * Buffer 0 — `dict_bytes`: the dictionary blob built by the OnPair trainer, +/// padded with [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] trailing zero +/// bytes so the over-copy decoder can read 16 bytes past the last token. +/// * Slot 0 — `dict_offsets`: `PrimitiveArray`, len `dict_size + 1`. +/// * Slot 1 — `codes`: `PrimitiveArray`. Each value only uses its low +/// `bits` bits; downstream `FastLanes::BitPacking` losslessly shrinks +/// the child to exactly `bits`-bit codes on disk. +/// * Slot 2 — `codes_offsets`: `PrimitiveArray`, len `num_rows + 1`. +/// FoR / RunEnd / etc. apply naturally via the cascading compressor. +/// * Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, len +/// `num_rows`. Used to size the canonical output buffer. +/// * Slot 4 — optional validity child. +/// +/// All three integer slot children flow through the standard +/// `compress_child` pipeline (see `vortex-btrblocks::schemes::string:: +/// OnPairScheme`), so any encoding registered with the compressor can +/// re-encode them — exactly the same shape as FSST's `codes` `VarBinArray`. +#[derive(Clone, prost::Message)] +pub struct OnPairMetadata { + /// Width of the per-row primitive `uncompressed_lengths` child. 
+ #[prost(enumeration = "PType", tag = "1")] + pub uncompressed_lengths_ptype: i32, + /// Bits-per-token the column was compressed with (9..=16). Every value + /// in the `codes` child only uses its low `bits` bits. + #[prost(uint32, tag = "2")] + pub bits: u32, + /// Number of dictionary tokens. `dict_offsets` has length `dict_size + 1`. + #[prost(uint64, tag = "3")] + pub dict_size: u64, + /// Total number of tokens across all rows. `codes` has this length; + /// `codes_offsets.last() == total_tokens`. + #[prost(uint64, tag = "4")] + pub total_tokens: u64, + /// PType of the `dict_offsets` slot child (defaults to U32, may be + /// narrowed to U16/U8 by the cascading compressor when values fit). + #[prost(enumeration = "PType", tag = "5")] + pub dict_offsets_ptype: i32, + /// PType of the `codes` slot child (typically U16, may be narrowed to U8 + /// when `bits <= 8`). + #[prost(enumeration = "PType", tag = "6")] + pub codes_ptype: i32, + /// PType of the `codes_offsets` slot child. + #[prost(enumeration = "PType", tag = "7")] + pub codes_offsets_ptype: i32, +} + +impl OnPairMetadata { + pub fn get_uncompressed_lengths_ptype(&self) -> VortexResult { + PType::try_from(self.uncompressed_lengths_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.uncompressed_lengths_ptype)) + } +} + +/// Slot indices on the outer [`Array`]. +pub(crate) const DICT_OFFSETS_SLOT: usize = 0; +pub(crate) const CODES_SLOT: usize = 1; +pub(crate) const CODES_OFFSETS_SLOT: usize = 2; +pub(crate) const UNCOMPRESSED_LENGTHS_SLOT: usize = 3; +pub(crate) const VALIDITY_SLOT: usize = 4; +pub(crate) const NUM_SLOTS: usize = 5; +pub(crate) const SLOT_NAMES: [&str; NUM_SLOTS] = [ + "dict_offsets", + "codes", + "codes_offsets", + "uncompressed_lengths", + "validity", +]; + +/// Inner data for an OnPair-encoded array. +/// +/// Holds only the dictionary blob (buffer 0). 
Every other piece — +/// `dict_offsets`, the per-token `codes`, the per-row `codes_offsets`, the +/// per-row `uncompressed_lengths`, and the optional validity child — is a +/// Vortex slot child so it can be re-encoded by the cascading compressor. +#[derive(Clone)] +pub struct OnPairData { + dict_bytes: BufferHandle, + bits: u32, + len: usize, +} + +impl OnPairData { + pub fn new(dict_bytes: BufferHandle, bits: u32, len: usize) -> Self { + Self { + dict_bytes, + bits, + len, + } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn bits(&self) -> u32 { + self.bits + } + + pub fn dict_bytes(&self) -> &ByteBuffer { + self.dict_bytes.as_host() + } + + pub fn dict_bytes_handle(&self) -> &BufferHandle { + &self.dict_bytes + } +} + +impl Display for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "len: {}, bits: {}, dict_bytes_len: {}", + self.len, + self.bits, + self.dict_bytes.len() + ) + } +} + +impl Debug for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OnPairData") + .field("len", &self.len) + .field("bits", &self.bits) + .field("dict_bytes_len", &self.dict_bytes.len()) + .finish() + } +} + +impl ArrayHash for OnPairData { + fn array_hash(&self, state: &mut H, precision: Precision) { + self.dict_bytes.as_host().array_hash(state, precision); + state.write_u32(self.bits); + } +} + +impl ArrayEq for OnPairData { + fn array_eq(&self, other: &Self, precision: Precision) -> bool { + self.bits == other.bits + && self + .dict_bytes + .as_host() + .array_eq(other.dict_bytes.as_host(), precision) + } +} + +/// Zero-sized VTable marker for the OnPair encoding. +#[derive(Clone, Debug)] +pub struct OnPair; + +impl OnPair { + /// Build an [`OnPairArray`] from already-materialised parts. 
+ #[expect(clippy::too_many_arguments, reason = "every child is a real input")] + pub fn try_new( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> VortexResult { + validate_parts( + &dtype, + &dict_offsets, + &codes, + &codes_offsets, + &uncompressed_lengths, + bits, + )?; + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + }) + } + + #[expect(clippy::too_many_arguments, reason = "every child is a real input")] + pub(crate) unsafe fn new_unchecked( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> OnPairArray { + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + } + } +} + +fn validate_parts( + dtype: &DType, + dict_offsets: &ArrayRef, + codes: &ArrayRef, + codes_offsets: &ArrayRef, + uncompressed_lengths: &ArrayRef, + bits: u32, +) -> VortexResult<()> { + vortex_ensure!( + matches!(dtype, DType::Binary(_) | DType::Utf8(_)), + "OnPair arrays must be Binary or Utf8, found {dtype}" + ); + vortex_ensure!((9..=16).contains(&bits), "bits {bits} out of range [9, 16]"); + + if !dict_offsets.dtype().is_int() || dict_offsets.dtype().is_nullable() { + 
vortex_bail!(InvalidArgument: "dict_offsets must be non-nullable integer"); + } + if !codes.dtype().is_int() || codes.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes must be non-nullable integer"); + } + if !codes_offsets.dtype().is_int() || codes_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_offsets must be non-nullable integer"); + } + if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "uncompressed_lengths must be non-nullable integer"); + } + if codes_offsets.len() != uncompressed_lengths.len() + 1 { + vortex_bail!(InvalidArgument: + "codes_offsets.len ({}) != uncompressed_lengths.len + 1 ({})", + codes_offsets.len(), + uncompressed_lengths.len() + 1 + ); + } + Ok(()) +} + +impl VTable for OnPair { + type TypedArrayData = OnPairData; + type OperationsVTable = Self; + type ValidityVTable = Self; + + fn id(&self) -> ArrayId { + static ID: CachedId = CachedId::new("vortex.onpair"); + *ID + } + + fn validate( + &self, + data: &Self::TypedArrayData, + dtype: &DType, + len: usize, + slots: &[Option], + ) -> VortexResult<()> { + let dict_offsets = slots[DICT_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray dict_offsets slot missing"))?; + let codes = slots[CODES_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes slot missing"))?; + let codes_offsets = slots[CODES_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes_offsets slot missing"))?; + let uncompressed_lengths = slots[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray uncompressed_lengths slot missing"))?; + validate_parts( + dtype, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + data.bits, + )?; + if uncompressed_lengths.len() != len { + vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); + } + if data.len != len { + vortex_bail!(InvalidArgument: "OnPairData len 
{} != outer len {}", data.len, len); + } + Ok(()) + } + + fn nbuffers(_array: ArrayView<'_, Self>) -> usize { + 1 + } + + fn buffer(array: ArrayView<'_, Self>, idx: usize) -> BufferHandle { + match idx { + 0 => array.dict_bytes_handle().clone(), + _ => vortex_panic!("OnPairArray buffer index {idx} out of bounds"), + } + } + + fn buffer_name(_array: ArrayView<'_, Self>, idx: usize) -> Option { + match idx { + 0 => Some("dict_bytes".to_string()), + _ => vortex_panic!("OnPairArray buffer_name index {idx} out of bounds"), + } + } + + fn serialize( + array: ArrayView<'_, Self>, + _session: &VortexSession, + ) -> VortexResult>> { + let dict_size = array.dict_offsets().len().saturating_sub(1) as u64; + let total_tokens = array.codes().len() as u64; + Ok(Some( + OnPairMetadata { + uncompressed_lengths_ptype: array.uncompressed_lengths().dtype().as_ptype().into(), + bits: array.bits(), + dict_size, + total_tokens, + dict_offsets_ptype: array.dict_offsets().dtype().as_ptype().into(), + codes_ptype: array.codes().dtype().as_ptype().into(), + codes_offsets_ptype: array.codes_offsets().dtype().as_ptype().into(), + } + .encode_to_vec(), + )) + } + + fn deserialize( + &self, + dtype: &DType, + len: usize, + metadata: &[u8], + buffers: &[BufferHandle], + children: &dyn ArrayChildren, + _session: &VortexSession, + ) -> VortexResult> { + if buffers.len() != 1 { + vortex_bail!(InvalidArgument: "Expected 1 buffer, got {}", buffers.len()); + } + let metadata = OnPairMetadata::decode(metadata)?; + let uncompressed_ptype = metadata.get_uncompressed_lengths_ptype()?; + + // Slot children. We pass `usize::MAX` for slots whose length we + // don't know up front (`dict_offsets` and `codes`). `codes_offsets` + // has known length `len + 1`. 
+ let dict_offsets_len = usize::try_from(metadata.dict_size + 1) + .map_err(|_| vortex_err!("dict_size {} overflows usize", metadata.dict_size))?; + let total_tokens = usize::try_from(metadata.total_tokens) + .map_err(|_| vortex_err!("total_tokens {} overflows usize", metadata.total_tokens))?; + // The cascading compressor may have narrowed any of these integer + // children to a tighter ptype; the recorded ptype tells the framework + // exactly which dtype to materialise as. + let dict_offsets_ptype = PType::try_from(metadata.dict_offsets_ptype).map_err(|_| { + vortex_err!("invalid dict_offsets_ptype {}", metadata.dict_offsets_ptype) + })?; + let codes_ptype = PType::try_from(metadata.codes_ptype) + .map_err(|_| vortex_err!("invalid codes_ptype {}", metadata.codes_ptype))?; + let codes_offsets_ptype = PType::try_from(metadata.codes_offsets_ptype).map_err(|_| { + vortex_err!( + "invalid codes_offsets_ptype {}", + metadata.codes_offsets_ptype + ) + })?; + let dict_offsets = children.get( + 0, + &DType::Primitive(dict_offsets_ptype, Nullability::NonNullable), + dict_offsets_len, + )?; + let codes = children.get( + 1, + &DType::Primitive(codes_ptype, Nullability::NonNullable), + total_tokens, + )?; + let codes_offsets = children.get( + 2, + &DType::Primitive(codes_offsets_ptype, Nullability::NonNullable), + len + 1, + )?; + let uncompressed_lengths = children.get( + 3, + &DType::Primitive(uncompressed_ptype, Nullability::NonNullable), + len, + )?; + let validity = match children.len() { + 4 => Validity::from(dtype.nullability()), + 5 => Validity::Array(children.get(4, &Validity::DTYPE, len)?), + other => vortex_bail!(InvalidArgument: "Expected 4 or 5 children, got {other}"), + }; + + let data = OnPairData::new(buffers[0].clone(), metadata.bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(ArrayParts::new(self.clone(), dtype.clone(), 
len, data).with_slots(slots)) + } + + fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { + SLOT_NAMES[idx].to_string() + } + + fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { + canonicalize_onpair(array.as_view(), ctx).map(ExecutionResult::done) + } + + fn append_to_builder( + array: ArrayView<'_, Self>, + builder: &mut dyn ArrayBuilder, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()> { + let Some(builder) = builder.as_any_mut().downcast_mut::() else { + builder.extend_from_array( + &array + .array() + .clone() + .execute::(ctx)? + .into_array(), + ); + return Ok(()); + }; + + let next_buffer_index = builder.completed_block_count() + u32::from(builder.in_progress()); + let (buffers, views) = onpair_decode_views(array, next_buffer_index, ctx)?; + builder.push_buffer_and_adjusted_views( + &buffers, + &views, + array + .array() + .validity()? + .execute_mask(array.array().len(), ctx)?, + ); + Ok(()) + } + + fn execute_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + PARENT_KERNELS.execute(array, parent, child_idx, ctx) + } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + RULES.evaluate(array, parent, child_idx) + } +} + +impl ValidityVTable for OnPair { + fn validity(array: ArrayView<'_, OnPair>) -> VortexResult { + Ok(child_to_validity( + array.slots()[VALIDITY_SLOT].as_ref(), + array.dtype().nullability(), + )) + } +} + +/// Convenience extension trait. Slot accessors live here; methods reachable +/// through `OnPairData` flow via the `ArrayView -> Deref` chain. 
+pub trait OnPairArrayExt: TypedArrayRef { + fn dict_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[DICT_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray dict_offsets slot missing")) + } + fn codes(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes slot missing")) + } + fn codes_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes_offsets slot missing")) + } + fn uncompressed_lengths(&self) -> &ArrayRef { + self.as_ref().slots()[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray uncompressed_lengths slot missing")) + } + fn array_validity(&self) -> Validity { + child_to_validity( + self.as_ref().slots()[VALIDITY_SLOT].as_ref(), + self.as_ref().dtype().nullability(), + ) + } +} + +impl> OnPairArrayExt for T {} diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs new file mode 100644 index 00000000000..368c5ab0b7a --- /dev/null +++ b/encodings/experimental/onpair/src/canonical.rs @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Convert an [`OnPairArray`] to its canonical `VarBinViewArray` by running +//! the pure-Rust dictionary-lookup decoder over every row. 
+ +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::build_views::BinaryView; +use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; +use vortex_array::arrays::varbinview::build_views::build_views; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; +use vortex_buffer::ByteBufferMut; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::decode::OwnedDecodeInputs; + +pub(super) fn canonicalize_onpair( + array: ArrayView<'_, OnPair>, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let (buffers, views) = onpair_decode_views(array, 0, ctx)?; + let validity = array.array().validity()?; + Ok(unsafe { + VarBinViewArray::new_unchecked(views, Arc::from(buffers), array.dtype().clone(), validity) + .into_array() + }) +} + +pub(crate) fn onpair_decode_views( + array: ArrayView<'_, OnPair>, + start_buf_index: u32, + ctx: &mut ExecutionCtx, +) -> VortexResult<(Vec, Buffer)> { + let n = array.array().len(); + let lengths = array + .uncompressed_lengths() + .clone() + .execute::(ctx)?; + + #[expect(clippy::cast_possible_truncation)] + let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { + lengths.as_slice::

().iter().map(|x| *x as usize).sum() + }); + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + // Decode directly into the canonical output buffer's spare capacity — + // no temporary `Vec` + `extend_from_slice` round-trip. Total size + // is already known from `uncompressed_lengths`, so we can size the + // buffer once with the over-copy slack and call into the unchecked + // single-pass decoder. + let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: + // * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes + // above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE + // bytes past the true end, all within reserved capacity. + // * Caller has verified the array's invariants in `OnPair::try_new`, + // so every code is a valid index and `dict_bytes` is padded. + unsafe { + let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::(); + let written = dv.decode_rows_unchecked(0, n, dst); + debug_assert_eq!(written, total_size); + out_bytes.set_len(written); + } + + match_each_integer_ptype!(lengths.ptype(), |P| { + Ok(build_views( + start_buf_index, + MAX_BUFFER_LEN, + out_bytes, + lengths.as_slice::

(), + )) + }) +} diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs new file mode 100644 index 00000000000..50fa3189a0a --- /dev/null +++ b/encodings/experimental/onpair/src/compress.rs @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Train + compress entry points for the OnPair encoding. + +use onpair::Column; +use onpair::Config; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::buffer::BufferHandle; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; + +use crate::OnPair; +use crate::OnPairArray; + +/// Default OnPair training configuration: 12-bit codes ("dict-12"). +pub const DEFAULT_DICT12_CONFIG: Config = onpair::DEFAULT_CONFIG; + +/// Build a training config with a custom bit width. +pub fn config_with_bits(bits: u32) -> Config { + Config { + bits, + ..onpair::DEFAULT_CONFIG + } +} + +/// Compress an iterable of optional byte strings via the OnPair C++ library. 
+pub fn onpair_compress_iter<'a, I>( + iter: I, + len: usize, + dtype: DType, + config: Config, +) -> VortexResult +where + I: Iterator>, +{ + let mut flat: Vec = Vec::with_capacity(len * 16); + let mut offsets: Vec = Vec::with_capacity(len + 1); + let mut uncompressed_lengths: BufferMut = BufferMut::with_capacity(len); + let mut validity_bits: Vec = Vec::with_capacity(len); + offsets.push(0); + + for item in iter { + match item { + Some(bytes) => { + flat.extend_from_slice(bytes); + offsets.push(flat.len() as u64); + uncompressed_lengths.push( + i32::try_from(bytes.len()).vortex_expect("string length must fit in i32"), + ); + validity_bits.push(true); + } + None => { + offsets.push(flat.len() as u64); + uncompressed_lengths.push(0); + validity_bits.push(false); + } + } + } + + let column = onpair::compress(&flat, &offsets, config) + .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; + let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = parts_to_children(&column)?; + + let uncompressed_lengths = uncompressed_lengths.into_array(); + let validity = match dtype.nullability() { + Nullability::NonNullable => Validity::NonNullable, + Nullability::Nullable => Validity::from_iter(validity_bits), + }; + + OnPair::try_new( + dtype, + dict_bytes, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity, + bits, + ) +} + +/// Lift a compressed [`Column`] into Vortex children + the dict buffer. +/// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. +fn parts_to_children( + column: &Column, +) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { + let bits = column.bits; + // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the + // over-copy decoder can issue a fixed 16-byte load for every token + // without risking an OOB read on the last entry. 
+ let mut padded = Vec::with_capacity(column.dict_bytes.len() + crate::MAX_TOKEN_SIZE); + padded.extend_from_slice(&column.dict_bytes); + padded.resize(column.dict_bytes.len() + crate::MAX_TOKEN_SIZE, 0); + // Align dict_bytes to 8 bytes so the segment that ultimately holds the + // OnPair tree starts at an 8-aligned in-memory address. Without this + // anchor, the per-buffer padding the serializer inserts is only + // *relative* to the segment start; if the segment lands at a u8-aligned + // heap address, downstream `PrimitiveArray::deserialize` panics + // with `Misaligned buffer cannot be used to build PrimitiveArray of u32`. + let dict_bytes = + BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); + + let dict_offsets = Buffer::::copy_from(column.dict_offsets.as_slice()).into_array(); + // The crate emits already-unpacked token codes (one `u16` per token), so + // they map straight onto the `codes` slot child. + let codes = Buffer::::copy_from(column.codes.as_slice()).into_array(); + // Per-row boundaries are `u64`; the array stores them as `u32`. Token + // counts comfortably fit `u32` for any single chunk. + let codes_offsets: Vec = column + .code_boundaries + .iter() + .map(|&b| { + u32::try_from(b).map_err(|_| vortex_err!("OnPair: code boundary {b} does not fit u32")) + }) + .collect::>()?; + let codes_offsets = Buffer::::copy_from(codes_offsets).into_array(); + Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) +} + +/// Compress a byte-string accessor (typically a `VarBinArray` or +/// `VarBinViewArray`). +pub fn onpair_compress>( + array: A, + len: usize, + dtype: &DType, + config: Config, +) -> VortexResult { + array.with_iterator(|iter| onpair_compress_iter(iter, len, dtype.clone(), config)) +} + +/// Compress any [`ArrayRef`] whose canonical form is a string array, by first +/// canonicalising to `VarBinViewArray`. 
+pub fn onpair_compress_array( + array: &ArrayRef, + config: Config, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let view = array.clone().execute::(ctx)?; + let len = view.len(); + let dtype = view.dtype().clone(); + onpair_compress(&view, len, &dtype, config) +} + +/// Convenience: build a default `ExecutionCtx` from `LEGACY_SESSION`. +pub fn onpair_compress_array_default( + array: &ArrayRef, + config: Config, +) -> VortexResult { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + onpair_compress_array(array, config, &mut ctx) +} diff --git a/encodings/experimental/onpair/src/compute/cast.rs b/encodings/experimental/onpair/src/compute/cast.rs new file mode 100644 index 00000000000..27b4ad378c7 --- /dev/null +++ b/encodings/experimental/onpair/src/compute/cast.rs @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::dtype::DType; +use vortex_array::scalar_fn::fns::cast::CastKernel; +use vortex_array::scalar_fn::fns::cast::CastReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Cast between `Utf8` and `Binary` (or adjust nullability) without touching +/// any of the encoded payload — we only rewrap into a new outer DType. +impl CastReduce for OnPair { + fn cast(array: ArrayView<'_, Self>, dtype: &DType) -> VortexResult> { + if !array.dtype().eq_ignore_nullability(dtype) { + return Ok(None); + } + let validity = array.array().validity()?; + let Some(new_validity) = + validity.trivially_cast_nullability(dtype.nullability(), array.array().len())? 
+ else { + return Ok(None); + }; + Ok(Some( + unsafe { + OnPair::new_unchecked( + dtype.clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + array.codes_offsets().clone(), + array.uncompressed_lengths().clone(), + new_validity, + array.bits(), + ) + } + .into_array(), + )) + } +} + +impl CastKernel for OnPair { + fn cast( + array: ArrayView<'_, Self>, + dtype: &DType, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + ::cast(array, dtype) + } +} diff --git a/encodings/experimental/onpair/src/compute/filter.rs b/encodings/experimental/onpair/src/compute/filter.rs new file mode 100644 index 00000000000..fbece54c4bb --- /dev/null +++ b/encodings/experimental/onpair/src/compute/filter.rs @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Filter that **shares the dictionary**. The previous implementation +//! decoded the whole array, filtered the canonical bytes, and re-trained +//! a brand-new OnPair dictionary on the surviving rows — order-of- +//! magnitude regressions on TPC-H Q22 at SF=10 traced back to that cost +//! (the customer table's `c_phone` column gets two consecutive filters, +//! each of which was paying full `Column::compress` training overhead). +//! +//! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical** +//! to the input; rebuild only `codes`, `codes_offsets`, +//! `uncompressed_lengths`, and validity by walking the mask. No decode, +//! no retrain, no C++ call on the read path. 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_mask::Mask; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl FilterKernel for OnPair { + #[expect(clippy::cognitive_complexity, clippy::cast_possible_truncation)] + fn filter( + array: ArrayView<'_, Self>, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let n_in = array.array().len(); + let n_out = mask.true_count(); + + // Materialise the per-row offset arrays we walk during filtering. + // The codes themselves we read through whatever ptype the + // cascading compressor narrowed to — match_each_integer_ptype + // dispatches on it below. + let codes_offsets_arr = array + .codes_offsets() + .clone() + .execute::(ctx)?; + let codes_arr = array.codes().clone().execute::(ctx)?; + + let mut new_codes_offsets = BufferMut::::with_capacity(n_out + 1); + + // The cascading compressor may have narrowed `codes_offsets` + // (e.g. u32 → u16 if every row's token count is small). Read + // through whatever ptype it lives at — the values still fit in + // `usize` when widened. Likewise for `codes`. + let new_codes: ArrayRef = match_each_integer_ptype!(codes_offsets_arr.ptype(), |OP| { + let codes_offsets = codes_offsets_arr.as_slice::(); + + // First pass: sum the surviving token count so we reserve once. + let mut new_codes_len: usize = 0; + for r in 0..n_in { + if mask.value(r) { + new_codes_len += (codes_offsets[r + 1] as usize) - (codes_offsets[r] as usize); + } + } + + // SAFETY: capacity reserved. + unsafe { new_codes_offsets.push_unchecked(0u32) }; + + match_each_integer_ptype!(codes_arr.ptype(), |P| { + let codes = codes_arr.as_slice::

(); + let mut out = BufferMut::

::with_capacity(new_codes_len); + let mut cursor: u32 = 0; + for r in 0..n_in { + if mask.value(r) { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let segment = unsafe { codes.get_unchecked(lo..hi) }; + out.extend_from_slice(segment); + let segment_len = u32::try_from(hi - lo) + .map_err(|_| vortex_err!("token segment overflows u32"))?; + cursor = cursor + .checked_add(segment_len) + .ok_or_else(|| vortex_err!("codes_offsets overflow u32"))?; + // SAFETY: capacity reserved (n_out + 1 entries). + unsafe { new_codes_offsets.push_unchecked(cursor) }; + } + } + out.freeze().into_array() + }) + }); + + // uncompressed_lengths + validity flow through the standard + // primitive filter — these are short integer arrays so the cost + // is negligible compared to the (avoided) recompress. + let uncompressed_lengths = array.uncompressed_lengths().clone().filter(mask.clone())?; + let validity = array.array_validity().filter(mask)?; + + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + new_codes, + new_codes_offsets.freeze().into_array(), + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs new file mode 100644 index 00000000000..e33c49b80f1 --- /dev/null +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod cast; +mod filter; diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs new file mode 100644 index 00000000000..dd434811d06 --- /dev/null +++ b/encodings/experimental/onpair/src/decode.rs @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 
Copyright the Vortex contributors +// +//! Pure-Rust decoder for an [`OnPair`][crate::OnPair] array. +//! +//! The decode loop is intentionally simple — one `u16` code load, one +//! `u64` table load, one fixed 16-byte over-copy `memcpy` — so the +//! autovectoriser keeps the hot path SIMD-friendly. We materialise the +//! children once into native-aligned `Buffer`s (and pack the dict +//! offsets + lengths into a single `Buffer` lookup table) so the +//! inner loop indexes straight into raw slices with no branches. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Materialised, host-resident copies of every read path's input. +/// +/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot +/// on the outer `OnPair` array, possibly wrapped in a non-canonical +/// encoding the cascading compressor chose (e.g. FastLanes-bit-packed +/// `codes`, `narrow`-ed dict offsets). `execute::` may +/// hand us back a narrower ptype than the decode loop wants. `collect` +/// widens each child to the decoder's native width (`u32` for both offset +/// arrays, `u16` for codes) once so the inner loop is branch-free pointer +/// arithmetic. +/// +/// Construction also packs `dict_offsets` into the combined +/// `(offset << 16) | length` `dict_table` so the decode hot loop loads a +/// single `u64` per token instead of two adjacent `u32`s. +pub struct OwnedDecodeInputs { + pub dict_bytes: ByteBuffer, + /// `(dict_offset << 16) | dict_len` per token. `dict_len` ≤ + /// `MAX_TOKEN_SIZE = 16` so 16 bits suffice. 
+ pub dict_table: Buffer, + pub codes: Buffer, + pub codes_offsets: Buffer, +} + +impl OwnedDecodeInputs { + pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { + let dict_offsets_arr = to_primitive(array.dict_offsets(), ctx)?; + let dict_table = build_dict_table(&dict_offsets_arr); + Ok(Self { + dict_bytes: array.dict_bytes().clone(), + dict_table, + codes: widen_to_u16(&to_primitive(array.codes(), ctx)?), + codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?), + }) + } + + pub fn view(&self) -> DecodeView<'_> { + DecodeView { + dict_bytes: self.dict_bytes.as_slice(), + dict_table: self.dict_table.as_slice(), + codes: self.codes.as_slice(), + codes_offsets: self.codes_offsets.as_slice(), + } + } +} + +/// Pack `dict_offsets` directly into `(offset << 16) | length` per token. +/// Reads through the integer-ptype macro once so we don't have to widen +/// the offsets buffer first — saves one `Vec` allocation in the common +/// (non-narrowed) case. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn build_dict_table(arr: &PrimitiveArray) -> Buffer { + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + if slice.is_empty() { + return Buffer::::copy_from(Vec::::new()); + } + let dict_size = slice.len() - 1; + let mut table = BufferMut::::with_capacity(dict_size); + for i in 0..dict_size { + let off = slice[i] as u64; + let len = (slice[i + 1] - slice[i]) as u64; + // SAFETY: capacity reserved above; we push exactly dict_size times. + unsafe { table.push_unchecked((off << 16) | len) }; + } + table.freeze() + }) +} + +fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + arr.clone().execute::(ctx) +} + +/// Widen any integer-typed `PrimitiveArray` to `Buffer`. When the +/// underlying ptype already matches we transmute the buffer instead of +/// allocating a new one. Used when the cascading compressor narrowed an +/// offset array (e.g. `u32` → `u16`). +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u32(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U32 { + // Cheap: PrimitiveArray's underlying buffer is Arc-shared, so + // `into_buffer` on a clone is effectively a refcount bump. + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u32) }; + } + out.freeze() + }) +} + +/// As `widen_to_u32` but for `Buffer`. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u16(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U16 { + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u16) }; + } + out.freeze() + }) +} + +/// Borrowed slices for the decode loop. +#[derive(Copy, Clone)] +pub struct DecodeView<'a> { + pub dict_bytes: &'a [u8], + pub dict_table: &'a [u64], + pub codes: &'a [u16], + pub codes_offsets: &'a [u32], +} + +impl<'a> DecodeView<'a> { + /// Decode row `row` into `out` (appended). Thin wrapper around + /// [`Self::decode_rows_into`]. + #[inline] + pub fn decode_row_into(&self, row: usize, out: &mut Vec) { + self.decode_rows_into(row, 1, out); + } + + /// Bulk decode rows `[start, start + count)` contiguously into `out`. + /// Pre-computes the decoded length, reserves once, then delegates to + /// the unrolled fast path. Callers that already know the size (e.g. + /// canonicalize from `uncompressed_lengths`) should call + /// [`Self::decode_rows_into_with_size`] to skip the size pre-pass. + pub fn decode_rows_into(&self, start: usize, count: usize, out: &mut Vec) { + if count == 0 { + return; + } + let decoded_len = self.decoded_len_rows(start, count); + let written_start = out.len(); + out.reserve(decoded_len + crate::MAX_TOKEN_SIZE); + // SAFETY: capacity reserved above; `decode_rows_unchecked`'s + // invariants are upheld by the [`OnPair::try_new`] validation. + unsafe { + let written = + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)); + debug_assert_eq!(written, decoded_len); + out.set_len(written_start + written); + } + } + + /// Single-pass over-copy decode of a token window into raw `dst`. 
+ /// + /// Mirrors OnPair C++ `decode_all` (and `decompress`) + /// exactly: each iteration loads one `u16` code, one `u64` dict-table + /// entry, issues a fixed [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] + /// `copy_nonoverlapping` (which LLVM lowers to a single unaligned + /// 128-bit SIMD store on x86_64 / aarch64), and advances the cursor by + /// the *true* token length. The body is hand-unrolled four times so + /// the CPU can keep four independent stores in flight, matching the + /// `ONPAIR_EMIT4` block of the upstream `decode_all.h`. + /// + /// Returns the number of *true* bytes written. + /// + /// # Safety + /// * `dst` must point into a region with at least + /// `decoded_byte_length + MAX_TOKEN_SIZE` bytes of writable + /// uninitialised capacity. + /// * `self.dict_bytes` must have at least `MAX_TOKEN_SIZE` trailing + /// pad bytes past the last real token byte (`compress.rs` enforces + /// this). + /// * Every `code` in the window must be `< self.dict_table.len()`. + #[inline] + pub unsafe fn decode_rows_unchecked(&self, start: usize, count: usize, dst: *mut u8) -> usize { + if count == 0 { + return 0; + } + // SAFETY: caller invariants. + let lo = unsafe { *self.codes_offsets.get_unchecked(start) } as usize; + let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize; + + let codes_ptr = self.codes.as_ptr(); + let table_ptr = self.dict_table.as_ptr(); + let dict_ptr = self.dict_bytes.as_ptr(); + + let mut cursor = dst; + let unroll_end = lo + ((hi - lo) & !3); + let mut i = lo; + // SAFETY: indices derived from validated offsets; the 16-byte + // over-copy reads stay within `dict_bytes`'s trailing pad; writes + // stay within the caller-promised capacity. + unsafe { + while i < unroll_end { + macro_rules! 
emit { + ($k:expr) => {{ + let c = *codes_ptr.add(i + $k) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping( + dict_ptr.add(off), + cursor, + crate::MAX_TOKEN_SIZE, + ); + cursor = cursor.add(len); + }}; + } + emit!(0); + emit!(1); + emit!(2); + emit!(3); + i += 4; + } + while i < hi { + let c = *codes_ptr.add(i) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE); + cursor = cursor.add(len); + i += 1; + } + cursor.offset_from(dst) as usize + } + } + + /// Single-pass decode when the caller already knows the total decoded + /// byte length (e.g. from summing `uncompressed_lengths`). Skips the + /// size-precomputation pass. + /// + /// # Safety + /// `out.capacity() - out.len() >= total_size + MAX_TOKEN_SIZE` and + /// `total_size` equals the true decoded length. + #[inline] + pub unsafe fn decode_rows_into_with_size( + &self, + start: usize, + count: usize, + total_size: usize, + out: &mut Vec, + ) { + let written_start = out.len(); + debug_assert!(out.capacity() - written_start >= total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: caller's invariants. + let written = unsafe { + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)) + }; + debug_assert_eq!(written, total_size); + // SAFETY: `written` ≤ reserved capacity (caller invariants). + unsafe { out.set_len(written_start + written) }; + } + + /// Decoded byte length of row `row` without copying any bytes. + #[inline] + pub fn decoded_len(&self, row: usize) -> usize { + self.decoded_len_rows(row, 1) + } + + /// Decoded byte length of rows `[start, start + count)`. Uses the + /// combined `dict_table` — one `u64` load per token. 
+ #[inline] + pub fn decoded_len_rows(&self, start: usize, count: usize) -> usize { + if count == 0 { + return 0; + } + let lo = self.codes_offsets[start] as usize; + let hi = self.codes_offsets[start + count] as usize; + let mut total = 0usize; + // SAFETY: bounds checked by indexing above. + unsafe { + for i in lo..hi { + let c = *self.codes.get_unchecked(i) as usize; + total += (*self.dict_table.get_unchecked(c) & 0xffff) as usize; + } + } + total + } + + /// Iterate the decoded bytes of `row` without materialising the full + /// row, calling `f` on each contiguous dict slice. Returns + /// + /// * `true` if every slice was visited (i.e. `f` always returned + /// `true`), + /// * `false` if `f` short-circuited with `false`. + /// + /// Useful for predicates that can short-circuit, e.g. `equals` and + /// `starts_with`. + #[inline] + pub fn for_each_dict_slice bool>(&self, row: usize, mut f: F) -> bool { + let lo = self.codes_offsets[row] as usize; + let hi = self.codes_offsets[row + 1] as usize; + let codes = &self.codes[lo..hi]; + // SAFETY: codes were validated at construction time. 
+ unsafe { + for &c in codes { + let entry = *self.dict_table.get_unchecked(c as usize); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + let slice = self.dict_bytes.get_unchecked(off..off + len); + if !f(slice) { + return false; + } + } + } + true + } +} diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs new file mode 100644 index 00000000000..e8c891f5875 --- /dev/null +++ b/encodings/experimental/onpair/src/kernel.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::filter::FilterExecuteAdaptor; +use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; + +use crate::OnPair; + +// TODO: implement TakeExecute for OnPair to add a TakeExecuteAdaptor here +// (matches the FSST pattern; would dispatch take on the codes child + reuse +// the dictionary, mirroring the slice path). +pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&CastExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)), +]); diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs new file mode 100644 index 00000000000..7b247c71607 --- /dev/null +++ b/encodings/experimental/onpair/src/lib.rs @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vortex string array backed by the [OnPair][onpair] short-string +//! compression library, with `cast` and `filter` pushdown. +//! +//! The default training preset is `dict-12` (12 bits per token, dictionary +//! capped at 4 096 entries). See [`onpair_compress`] for the entry point and +//! [`OnPairArray`] for the resulting array type. +//! +//! 
[onpair]: https://arxiv.org/abs/2508.02280 + +mod array; +mod canonical; +mod compress; +mod compute; +pub mod decode; +mod kernel; +mod ops; +mod rules; +mod slice; +#[cfg(test)] +mod tests; + +pub use array::*; +pub use compress::*; + +/// Fixed token-byte over-copy width. Matches the `onpair` crate's `MAX_TOKEN_SIZE`: +/// the decoder copies exactly this many bytes per token and advances the +/// output cursor by the *true* token length. Lets the compiler emit a single +/// 128-bit SIMD store per token on x86_64 / aarch64 instead of a +/// variable-length memcpy. +pub const MAX_TOKEN_SIZE: usize = 16; diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs new file mode 100644 index 00000000000..55e6c77b1e0 --- /dev/null +++ b/encodings/experimental/onpair/src/ops.rs @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::varbin::varbin_scalar; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::OperationsVTable; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; + +impl OperationsVTable for OnPair { + fn scalar_at( + array: ArrayView<'_, OnPair>, + index: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let mut buf: Vec = Vec::with_capacity(dv.decoded_len(index)); + dv.decode_row_into(index, &mut buf); + Ok(varbin_scalar(ByteBuffer::from(buf), array.dtype())) + } +} diff --git a/encodings/experimental/onpair/src/rules.rs b/encodings/experimental/onpair/src/rules.rs new file mode 100644 index 00000000000..279c160c1eb --- /dev/null +++ b/encodings/experimental/onpair/src/rules.rs @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + 
+use vortex_array::arrays::slice::SliceReduceAdaptor; +use vortex_array::optimizer::rules::ParentRuleSet; +use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; + +use crate::OnPair; + +pub(crate) static RULES: ParentRuleSet = ParentRuleSet::new(&[ + ParentRuleSet::lift(&SliceReduceAdaptor(OnPair)), + ParentRuleSet::lift(&CastReduceAdaptor(OnPair)), +]); diff --git a/encodings/experimental/onpair/src/slice.rs b/encodings/experimental/onpair/src/slice.rs new file mode 100644 index 00000000000..48f3d6b8d16 --- /dev/null +++ b/encodings/experimental/onpair/src/slice.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Slicing an `OnPairArray` reuses the same dictionary blob, the full +//! `codes` child, and the full `dict_offsets` child. Only the +//! `codes_offsets` child (narrowed to `[start, end + 1)`), the +//! `uncompressed_lengths` child (narrowed to `[start, end)`) and the +//! optional validity child change. No decode, no re-training. 
+ +use std::ops::Range; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::IntoArray; +use vortex_array::arrays::slice::SliceReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl SliceReduce for OnPair { + fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { + let codes_offsets = array.codes_offsets().slice(range.start..range.end + 1)?; + let uncompressed_lengths = array.uncompressed_lengths().slice(range.clone())?; + let validity = array.array_validity().slice(range)?; + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + codes_offsets, + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/experimental/onpair/src/tests.rs b/encodings/experimental/onpair/src/tests.rs new file mode 100644 index 00000000000..2a2abb62d80 --- /dev/null +++ b/encodings/experimental/onpair/src/tests.rs @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::LazyLock; + +use prost::Message; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_array::session::ArraySession; +use vortex_array::test_harness::check_metadata; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_session::VortexSession; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::OnPairMetadata; +use crate::compress::DEFAULT_DICT12_CONFIG; +use 
crate::compress::onpair_compress; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn sample_input() -> VarBinArray { + VarBinArray::from_iter( + [ + Some("https://www.example.com/page"), + Some("https://www.example.com/data"), + Some("https://www.test.org/page"), + Some("ftp://files.example.com/x"), + Some("https://www.example.com/page"), + ], + DType::Utf8(Nullability::NonNullable), + ) +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_metadata_golden() { + check_metadata( + "onpair.metadata", + &OnPairMetadata { + uncompressed_lengths_ptype: PType::I32 as i32, + bits: 12, + dict_size: 4096, + total_tokens: 128_000, + dict_offsets_ptype: PType::U32 as i32, + codes_ptype: PType::U16 as i32, + codes_offsets_ptype: PType::U32 as i32, + } + .encode_to_vec(), + ); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_roundtrip() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + + let compressed = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).expect("compress"); + assert!(compressed.clone().into_array().is::()); + + let mut ctx = SESSION.create_execution_ctx(); + let decoded = compressed + .into_array() + .execute::(&mut ctx) + .expect("canonicalize"); + + decoded + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), 5); + assert_eq!( + got[0].as_deref(), + Some(b"https://www.example.com/page".as_ref()) + ); + assert_eq!( + got[3].as_deref(), + Some(b"ftp://files.example.com/x".as_ref()) + ); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_nullable_canonicalize() { + let input = VarBinArray::from_iter( + [Some("a"), None, Some("bbb"), None, Some("ccccc")], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, 
DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got[1], None); + assert_eq!(got[3], None); + assert_eq!(got[4].as_deref(), Some(b"ccccc".as_ref())); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_scalar_at() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let s = arr.into_array().execute_scalar(2, &mut ctx).unwrap(); + let v = s.as_utf8().value().unwrap(); + assert_eq!(v.as_bytes(), b"https://www.test.org/page"); +} + +/// The hot decode loop is 4×-unrolled with a scalar tail. Anything that +/// lands in the tail (1-3 leftover tokens, or zero total tokens) must +/// produce the same bytes as the unrolled body. Hit every row-count +/// near the boundary. 
+#[cfg_attr(miri, ignore)] +#[rstest::rstest] +#[case::n_1(1)] +#[case::n_2(2)] +#[case::n_3(3)] +#[case::n_4(4)] +#[case::n_5(5)] +#[case::n_7(7)] +#[case::n_8(8)] +#[case::n_9(9)] +fn test_onpair_unroll_tail_boundaries(#[case] n: usize) { + let words: &[&str] = &["a", "bb", "ccc", "https://www.example.com/x"]; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let input = VarBinArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), n); + for (i, expected) in strings.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(expected.as_bytes()), "n={n}, i={i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Empty array — the unroll path must short-circuit cleanly. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_empty() { + let input = VarBinArray::from_iter( + std::iter::empty::>(), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + assert_eq!(arr.len(), 0); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + assert_eq!(canonical.len(), 0); +} + +/// Filter must share the dictionary — never recompress (this is the +/// regression cause on TPC-H Q22 SF=10). Exercise both selectivities +/// and check that the result is bit-exact and still an OnPairArray. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_shares_dict() { + let n = 5_000usize; + let strings: Vec = (0..n) + .map(|i| format!("https://www.example.com/items/{i:08}")) + .collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let dict_bytes_before = arr.dict_bytes().clone(); + let dict_offsets_len_before = arr.dict_offsets().len(); + + // Keep every 7th row. + let keep: Vec = (0..n).map(|i| i % 7 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert!( + filtered.is::(), + "filter dropped OnPair encoding: got {}", + filtered.encoding_id() + ); + let typed = filtered.try_downcast::().expect("OnPair"); + // Dict must be byte-identical with the input — no retrain, no copy. + assert_eq!(typed.dict_bytes().as_slice(), dict_bytes_before.as_slice()); + assert_eq!(typed.dict_offsets().len(), dict_offsets_len_before); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Rebuild an OnPair array, swapping `codes_offsets` for a narrowed +/// (smaller-ptype) primitive copy. 
Used by the narrowed-child +/// regression tests below. +#[expect(clippy::cognitive_complexity)] +fn narrow_codes_offsets(arr: &crate::OnPairArray, target: PType) -> crate::OnPairArray { + let view = arr.as_view(); + let mut ctx = SESSION.create_execution_ctx(); + let original = view + .codes_offsets() + .clone() + .execute::(&mut ctx) + .unwrap(); + + let narrowed_array = match_each_integer_ptype!(original.ptype(), |SRC| { + let src = original.as_slice::(); + match_each_integer_ptype!(target, |DST| { + let mut buf = BufferMut::::with_capacity(src.len()); + for &v in src { + #[allow( + clippy::unnecessary_cast, + reason = "macro-generated SRC may already be u64" + )] + buf.push(DST::try_from(v as u64).expect("value must fit in target ptype")); + } + PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array() + }) + }); + + unsafe { + OnPair::new_unchecked( + view.dtype().clone(), + view.dict_bytes_handle().clone(), + view.dict_offsets().clone(), + view.codes().clone(), + narrowed_array, + view.uncompressed_lengths().clone(), + view.array_validity(), + view.bits(), + ) + } +} + +/// Regression: the cascading compressor can narrow `codes_offsets` +/// from u32 → u16 when every row's token count is small. The previous +/// `filter` impl read the child as `as_slice::()` and panicked +/// with `Other error: Attempted to get slice of type u32 from array +/// of type u16`. The fix dispatches via `match_each_integer_ptype!`. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u16() { + let n = 200usize; + // Short rows so per-row token counts stay small and codes_offsets + // values fit in u16. (We narrow manually below regardless — this + // matches the shape the cascading compressor produces in the + // wild.) 
+ let strings: Vec = (0..n).map(|i| format!("r{:03}", i)).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + + // Force `codes_offsets` to u16 so the panicking pre-fix + // `as_slice::()` would fire. + let arr = narrow_codes_offsets(&arr, PType::U16); + assert_eq!( + arr.as_view().codes_offsets().dtype().as_ptype(), + PType::U16, + "codes_offsets must be u16 to exercise the regression path" + ); + + let keep: Vec = (0..n).map(|i| i % 3 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + // Pre-fix: this call panics with "Attempted to get slice of type + // u32 from array of type u16". Post-fix: succeeds. + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + let typed = filtered.try_downcast::().expect("OnPair"); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Same regression, narrowed to u8 (smallest possible ptype) — extra +/// coverage that the macro dispatch handles every integer ptype the +/// cascading compressor might pick. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u8() { + let n = 100usize; + let strings: Vec = (0..n).map(|i| format!("{i}")).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let arr = narrow_codes_offsets(&arr, PType::U8); + assert_eq!(arr.as_view().codes_offsets().dtype().as_ptype(), PType::U8); + + let mask = vortex_mask::Mask::from_iter((0..n).map(|i| i % 2 == 0)); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert_eq!(filtered.len(), n / 2); +} diff --git a/encodings/experimental/onpair/tests/big_data.rs b/encodings/experimental/onpair/tests/big_data.rs new file mode 100644 index 00000000000..0be025dcfc5 --- /dev/null +++ b/encodings/experimental/onpair/tests/big_data.rs @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! End-to-end smoke test on a realistically-sized input. Validates the +//! pure-Rust decode path and pushdown predicates end-to-end through the new +//! u16-codes layout. 
+ +#![allow( + clippy::cast_possible_truncation, + clippy::redundant_clone, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; +use std::time::Instant; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::session::ArraySession; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "GET /api/v1/users/{id}/profile HTTP/1.1", + "POST /api/v1/users/{id}/sessions HTTP/1.1", + "GET /static/js/app.{id}.js HTTP/1.1", + "GET /static/css/app.{id}.css HTTP/1.1", + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "ftp://files.example.com/dump/{id}.tar.gz", + "ssh://deploy@build-{id}.internal:22", + "redis://cache-{id}.svc.cluster.local:6379", + "INFO request_id={id} method=GET status=200", + "WARN request_id={id} method=POST status=429", + "ERROR request_id={id} method=PUT status=500", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + 
out +} + +#[test] +#[cfg_attr(miri, ignore)] +fn smoke_100k_rows() { + let n = 100_000; + let strings = corpus(n); + let raw_bytes: usize = strings.iter().map(|s| s.len()).sum(); + + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + + let t0 = Instant::now(); + let arr = onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .expect("compress"); + let compress_elapsed = t0.elapsed(); + let bits = arr.bits(); + eprintln!( + "compressed {} rows ({} raw bytes) in {:?}, bits={}", + n, raw_bytes, compress_elapsed, bits + ); + + let arr_ref = arr.into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + // Full canonical round-trip via the pure-Rust decoder. + let t0 = Instant::now(); + let decoded = arr_ref + .clone() + .execute::(&mut ctx) + .expect("canonicalize"); + eprintln!("canonicalized in {:?}", t0.elapsed()); + + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_bytes(); + assert_eq!(got, Some(want), "row {} mismatch", i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + eprintln!("roundtrip OK on all {} rows", n); + + // Equality pushdown: pick a specific row's value and ensure the kernel + // finds all occurrences. + let needle_row = 42; + let needle = strings[needle_row].clone(); + let want_eq = strings.iter().filter(|s| **s == needle).count(); + let eq = arr_ref + .binary( + ConstantArray::new(needle.as_str(), n).into_array(), + Operator::Eq, + ) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), want_eq); + eprintln!("eq pushdown matches reference count ({})", want_eq); + + // Prefix pushdown. 
+ let prefix = "https://www."; + let want_prefix = strings.iter().filter(|s| s.starts_with(prefix)).count(); + let pat = ConstantArray::new(format!("{prefix}%").as_str(), n).into_array(); + let got_prefix = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_prefix, want_prefix); + eprintln!("starts_with pushdown matches reference ({})", want_prefix); + + // Contains pushdown. + let sub = "status=500"; + let want_sub = strings.iter().filter(|s| s.contains(sub)).count(); + let pat = ConstantArray::new(format!("%{sub}%").as_str(), n).into_array(); + let got_sub = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_sub, want_sub); + eprintln!("contains pushdown matches reference ({})", want_sub); +} diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 40b0ae52aae..493c1684318 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -30,6 +30,7 @@ vortex-error = { workspace = true } vortex-fastlanes = { workspace = true } vortex-fsst = { workspace = true } vortex-mask = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true, optional = true } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } @@ -48,7 +49,7 @@ vortex-session = { workspace = true } [features] # This feature enabled unstable encodings for which we don't guarantee stability. 
-unstable_encodings = ["dep:vortex-tensor", "vortex-zstd?/unstable_encodings"] +unstable_encodings = ["dep:vortex-tensor", "dep:vortex-onpair", "vortex-zstd?/unstable_encodings"] pco = ["dep:pco", "dep:vortex-pco"] zstd = ["dep:vortex-zstd"] diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index ab77f625764..930b2d405d9 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -53,7 +53,11 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // String schemes. //////////////////////////////////////////////////////////////////////////////////////////////// &string::StringDictScheme, + // Both string-fragmentation schemes are registered; the sample-based + // selector keeps whichever is smaller per column. &string::FSSTScheme, + #[cfg(feature = "unstable_encodings")] + &string::OnPairScheme, &string::StringConstantScheme, &string::NullDominatedSparseScheme, // Decimal schemes. @@ -168,14 +172,22 @@ impl BtrBlocksCompressorBuilder { /// preserves the array buffer layout for zero-conversion GPU decompression. Without it, /// interleaved Zstd compression is used. pub fn only_cuda_compatible(self) -> Self { - let builder = self.exclude_schemes([ + // String fragmentation schemes (OnPair, FSST) require host-side + // dictionary expansion at decode time, which is incompatible with + // pure-GPU decompression paths. Strip whichever string-fragment + // scheme is enabled by feature. 
+ #[cfg_attr(not(feature = "unstable_encodings"), allow(unused_mut))] + let mut excluded: Vec = vec![ integer::SparseScheme.id(), integer::IntRLEScheme.id(), float::FloatRLEScheme.id(), float::NullDominatedSparseScheme.id(), string::StringDictScheme.id(), string::FSSTScheme.id(), - ]); + ]; + #[cfg(feature = "unstable_encodings")] + excluded.push(string::OnPairScheme.id()); + let builder = self.exclude_schemes(excluded); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] let builder = builder.with_new_scheme(&string::ZstdBuffersScheme); diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index 183d0c32fb3..5161e084c6f 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -903,7 +903,7 @@ pub(crate) fn rle_compress( } #[cfg(feature = "unstable_encodings")] -fn try_compress_delta( +pub(crate) fn try_compress_delta( compressor: &CascadingCompressor, child: &ArrayRef, parent_ctx: &CompressorContext, diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 47bea1670a2..753a1deb8f4 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -21,11 +21,21 @@ use vortex_fsst::FSST; use vortex_fsst::FSSTArrayExt; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::DEFAULT_DICT12_CONFIG; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPair; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPairArrayExt; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::onpair_compress; use vortex_sparse::Sparse; use vortex_sparse::SparseExt as _; use super::integer::IntDictScheme; use super::integer::SparseScheme as IntSparseScheme; +#[cfg(feature = "unstable_encodings")] +use super::integer::try_compress_delta; use crate::ArrayAndStats; use crate::CascadingCompressor; use 
crate::CompressorContext; @@ -33,9 +43,25 @@ use crate::Scheme; use crate::SchemeExt; /// FSST (Fast Static Symbol Table) compression. +/// +/// One of the two string-fragmentation schemes in the default [`ALL_SCHEMES`] +/// (alongside [`OnPairScheme`]); the sample-based selector keeps whichever is +/// smaller per column. FSST compresses faster, OnPair usually wins on ratio. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; +/// OnPair short-string compression (dict-12). +/// +/// A default string-fragmentation scheme (alongside [`FSSTScheme`]) — targets +/// large columns of short-to-medium strings with high lexical overlap, like +/// URLs or log lines. Uses a learned dictionary of frequent adjacent substrings +/// (built by the OnPair trainer at compress time) and 12-bit token codes stored +/// as a u16 child, with offsets / uncompressed-lengths flowing through the +/// cascading compressor like any other primitive children. +#[cfg(feature = "unstable_encodings")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct OnPairScheme; + /// Sparse encoding for null-dominated arrays. /// /// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. @@ -138,6 +164,160 @@ impl Scheme for FSSTScheme { } } +#[cfg(feature = "unstable_encodings")] +impl Scheme for OnPairScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.onpair" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// One slot child: `uncompressed_lengths`. The dictionary blob, dictionary + /// offsets, codes (u16), and codes offsets all live as raw byte buffers + /// on the OnPair array — they're not primitive slot children, so the + /// cascading compressor doesn't recompress them. 
Codes intentionally + /// 4 primitive slot children flow through the cascading compressor: + /// `dict_offsets` (u32 → typically `FoR`/`BitPacked`), `codes` (u16 → + /// `FastLanes::BitPacked` to exactly `bits` = 12 by default), + /// `codes_offsets` (u32 → `FoR`), `uncompressed_lengths` (i32 → narrow + /// + `FoR`). Validity stays untouched. + fn num_children(&self) -> usize { + 4 + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Sample) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let utf8 = data.array_as_utf8().into_owned(); + let onpair_array = onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; + + let dict_offsets = compress_offsets_child( + compressor, + onpair_array.dict_offsets(), + &compress_ctx, + self.id(), + 0, + exec_ctx, + )?; + let codes = compress_primitive_child( + compressor, + onpair_array.codes(), + &compress_ctx, + self.id(), + 1, + exec_ctx, + )?; + let codes_offsets = compress_offsets_child( + compressor, + onpair_array.codes_offsets(), + &compress_ctx, + self.id(), + 2, + exec_ctx, + )?; + let uncompressed_lengths = compress_primitive_child( + compressor, + onpair_array.uncompressed_lengths(), + &compress_ctx, + self.id(), + 3, + exec_ctx, + )?; + + Ok(OnPair::try_new( + onpair_array.dtype().clone(), + onpair_array.dict_bytes_handle().clone(), + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + onpair_array.array_validity(), + onpair_array.bits(), + )? + .into_array()) + } +} + +/// Narrow a primitive child to its tightest int type, then forward it to +/// the cascading compressor. 
+#[cfg(feature = "unstable_encodings")] +fn compress_primitive_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: vortex_compressor::scheme::SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, +) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow(exec_ctx)? + .into_array(); + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx) +} + +/// Minimum child length before delta is even attempted. Delta carries fixed +/// overhead (a separate `bases` array plus FastLanes' 1024-element lane +/// packing), so on short children it can only lose. +#[cfg(feature = "unstable_encodings")] +const OFFSETS_DELTA_MIN_LEN: usize = 2048; + +/// Compress a monotonic offsets child. For children of at least +/// [`OFFSETS_DELTA_MIN_LEN`] it tries both the normal cascading path and a +/// delta path and keeps whichever produces fewer bytes; shorter children skip +/// delta entirely. `dict_offsets` and `codes_offsets` are cumulative +/// (monotonic), so delta (per-entry deltas) usually packs much tighter than +/// FoR+bitpacking over the full range. +#[cfg(feature = "unstable_encodings")] +fn compress_offsets_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: vortex_compressor::scheme::SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, +) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow(exec_ctx)? 
+ .into_array(); + let plain = + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx)?; + if narrowed.len() < OFFSETS_DELTA_MIN_LEN { + return Ok(plain); + } + let delta = try_compress_delta( + compressor, + &narrowed, + compress_ctx, + scheme_id, + child_idx, + exec_ctx, + )?; + if delta.nbytes() < plain.nbytes() { + Ok(delta) + } else { + Ok(plain) + } +} + impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" @@ -411,8 +591,25 @@ mod scheme_selection_tests { Ok(()) } + #[cfg(feature = "unstable_encodings")] + #[test] + fn test_onpair_in_default_scheme_list() { + use crate::SchemeExt; + use crate::schemes::string::OnPairScheme; + + let ids: Vec<_> = crate::ALL_SCHEMES.iter().map(|s| s.id()).collect(); + assert!( + ids.contains(&OnPairScheme.id()), + "OnPairScheme not registered in ALL_SCHEMES" + ); + } + + #[cfg(feature = "unstable_encodings")] #[test] - fn test_fsst_compressed() -> VortexResult<()> { + fn test_onpair_compressed() -> VortexResult<()> { + // Dictionary-style string corpus: high lexical overlap, short rows. + // OnPair beats FSST on this corpus, so it wins the sample-based + // comparison even though both are registered by default. let mut strings = Vec::with_capacity(1000); for i in 0..1000 { strings.push(Some(format!( @@ -423,7 +620,48 @@ mod scheme_selection_tests { let array_ref = array.into_array(); let compressed = BtrBlocksCompressor::default() .compress(&array_ref, &mut SESSION.create_execution_ctx())?; - assert!(compressed.is::()); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + Ok(()) + } + + /// FSST is registered in the default scheme list (alongside OnPair), and an + /// FSST-only builder still produces an FSST array. 
+ #[test] + fn test_fsst_in_default_scheme_list() -> VortexResult<()> { + use crate::BtrBlocksCompressorBuilder; + use crate::SchemeExt; + use crate::schemes::string::FSSTScheme; + + // FSST is registered by default. + assert!( + crate::ALL_SCHEMES.iter().any(|s| s.id() == FSSTScheme.id()), + "FSSTScheme should be in ALL_SCHEMES", + ); + + // An FSST-only builder still produces an FSST array for FSST-favourable + // input. + let mut strings = Vec::with_capacity(1000); + for i in 0..1000 { + strings.push(Some(format!( + "this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern" + ))); + } + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + + let compressor = BtrBlocksCompressorBuilder::empty() + .with_new_scheme(&FSSTScheme) + .build(); + let compressed = compressor.compress(&array_ref, &mut SESSION.create_execution_ctx())?; + assert!( + compressed.is::(), + "expected FSST when only FSSTScheme is registered, got {}", + compressed.encoding_id() + ); Ok(()) } } diff --git a/vortex-btrblocks/tests/onpair_roundtrip.rs b/vortex-btrblocks/tests/onpair_roundtrip.rs new file mode 100644 index 00000000000..1cef6d471b8 --- /dev/null +++ b/vortex-btrblocks/tests/onpair_roundtrip.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! End-to-end round-trip through the full Vortex compressor + decompressor +//! on string arrays. Lives in `vortex-btrblocks` (gated on `unstable_encodings`) +//! so it exercises the same code path the file writer uses, not just the +//! OnPair crate in isolation. 
+ +#![cfg(feature = "unstable_encodings")] +#![allow( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::session::ArraySession; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +/// Helper: synthetic short-string corpus that the cascading compressor should +/// route through OnPair. +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + out +} + +#[test] +fn nonnullable_roundtrip_via_default_compressor() { + let n = 4096; + let strings = corpus(n); + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + // Don't assert a specific scheme — both OnPair and FSST are registered and + // the sample-based selector keeps whichever is smaller. 
What matters is the + // round-trip. + + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + assert_eq!( + got, + Some(strings[i].as_bytes()), + "mismatch at row {i}: got {:?}", + got.map(|b| String::from_utf8_lossy(b).into_owned()), + ); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn nullable_roundtrip_via_default_compressor() { + let n = 2048; + let strings: Vec> = corpus(n) + .into_iter() + .enumerate() + .map(|(i, s)| (i % 7 != 0).then_some(s)) + .collect(); + + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + // Don't assert OnPair specifically here — the sample-based selector may + // pick a different scheme on tiny inputs. What matters is the round-trip. + + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_deref().map(str::as_bytes); + assert_eq!(got, want, "mismatch at row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn empty_and_short_string_roundtrip() { + // Edge cases: empty strings interleaved with short ones. 
+ let strings = vec!["", "a", "", "bb", "ccc", "", "dddd", "eeeee", ""]; + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + decoded + .with_iterator(|iter| { + let got: Vec<_> = iter.collect(); + for (i, want) in strings.iter().enumerate() { + assert_eq!(got[i], Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index 77d664a12cb..c4bf980d683 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -46,6 +46,7 @@ vortex-io = { workspace = true } vortex-layout = { workspace = true } vortex-mask = { workspace = true } vortex-metrics = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true } vortex-runend = { workspace = true } vortex-scan = { workspace = true } @@ -78,6 +79,7 @@ tokio = [ zstd = ["dep:vortex-zstd", "vortex-btrblocks/zstd", "vortex-btrblocks/pco"] # This feature enables unstable encodings for which we don't guarantee stability. 
unstable_encodings = [ + "dep:vortex-onpair", "dep:vortex-tensor", "vortex-zstd?/unstable_encodings", "vortex-btrblocks/unstable_encodings", diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs index e69b5848de2..9b927abff54 100644 --- a/vortex-file/src/lib.rs +++ b/vortex-file/src/lib.rs @@ -115,6 +115,8 @@ use vortex_array::arrays::patched::use_experimental_patches; use vortex_array::session::ArraySessionExt; use vortex_bytebool::ByteBool; use vortex_fsst::FSST; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_session::VortexSession; use vortex_zigzag::ZigZag; @@ -162,6 +164,8 @@ pub fn register_default_encodings(session: &VortexSession) { arrays.register(ByteBool); arrays.register(Dict); arrays.register(FSST); + #[cfg(feature = "unstable_encodings")] + arrays.register(OnPair); arrays.register(Pco); arrays.register(ZigZag); #[cfg(feature = "zstd")] diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 661e72240d1..ed58f32e11d 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -53,6 +53,8 @@ use vortex_layout::layouts::repartition::RepartitionWriterOptions; use vortex_layout::layouts::table::TableStrategy; use vortex_layout::layouts::zoned::writer::ZonedLayoutOptions; use vortex_layout::layouts::zoned::writer::ZonedStrategy; +#[cfg(feature = "unstable_encodings")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_runend::RunEnd; use vortex_sequence::Sequence; @@ -102,6 +104,8 @@ pub static ALLOWED_ENCODINGS: LazyLock> = LazyLock::new(|| { allowed.insert(Delta.id()); allowed.insert(FoR.id()); allowed.insert(FSST.id()); + #[cfg(feature = "unstable_encodings")] + allowed.insert(OnPair.id()); allowed.insert(Pco.id()); allowed.insert(RLE.id()); allowed.insert(RunEnd.id()); diff --git a/vortex-file/tests/test_onpair_string_roundtrip.rs b/vortex-file/tests/test_onpair_string_roundtrip.rs new file mode 100644 index 00000000000..803c6869e46 --- /dev/null 
+++ b/vortex-file/tests/test_onpair_string_roundtrip.rs @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Round-trip stress tests for OnPair through the full Vortex file writer + +//! reader. Mirrors the call shape `vortex-bench/src/conversions.rs` uses and +//! the multi-column, many-chunk pattern of TPC-H tables (`supplier_0.vortex` +//! is the file from which CI surfaced +//! `Misaligned buffer cannot be used to build PrimitiveArray of u32`). + +#![cfg(feature = "unstable_encodings")] +#![expect( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::redundant_clone +)] + +use std::sync::LazyLock; + +use futures::StreamExt; +use futures::pin_mut; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::aggregate_fn::session::AggregateFnSession; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::session::DTypeSession; +use vortex_array::optimizer::kernels::ArrayKernels; +use vortex_array::scalar_fn::session::ScalarFnSession; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::ByteBuffer; +use vortex_file::OpenOptionsSessionExt; +use vortex_file::WriteOptionsSessionExt; +use vortex_io::session::RuntimeSession; +use vortex_layout::session::LayoutSession; +use vortex_session::VortexSession; + +/// Full default Vortex session — the same set of sub-sessions +/// `vortex::VortexSession::default()` would install, plus +/// `register_default_encodings`. Built inline here because `vortex-file` +/// can't depend on the umbrella `vortex` crate (it's the other way round). 
+static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::empty() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::(); + vortex_file::register_default_encodings(&session); + session +}); + +fn corpus(n: usize, offset: u64) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64.wrapping_add(offset); + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + out +} + +/// Write `data` to an in-memory `Vec` using the **full default Vortex +/// compressor** (`WriteStrategyBuilder::default()` = +/// `BtrBlocksCompressor::default()` cascading through every registered +/// scheme, including OnPair), then open the resulting bytes via +/// `OpenOptions::open_buffer` and stream every chunk back. +async fn write_and_read_back(data: vortex_array::ArrayRef) -> Vec { + // `write_options()` builds a `VortexWriteOptions` whose `strategy` is + // `WriteStrategyBuilder::default().build()` — the same path `vortex-bench` + // uses for Parquet → Vortex conversion. No custom strategy injected. + let mut bytes = Vec::new(); + SESSION + .write_options() + .write(&mut bytes, data.to_array_stream()) + .await + .expect("write Vortex file"); + + // Read back from the in-memory byte buffer; no disk, no FS. 
+ let bytes = ByteBuffer::from(bytes); + let vxf = SESSION.open_options().open_buffer(bytes).expect("open"); + + let stream = vxf + .scan() + .expect("scan") + .into_stream() + .expect("into_stream"); + pin_mut!(stream); + + let mut chunks = Vec::new(); + while let Some(chunk) = stream.next().await { + chunks.push(chunk.expect("chunk")); + } + chunks +} + +/// Single string column, single chunk. The simplest case. +#[tokio::test] +async fn single_column_single_chunk() { + let n = 4096usize; + let strings = corpus(n, 0); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["url"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let url = strct.unmasked_field(0).clone(); + let mut ctx = SESSION.create_execution_ctx(); + let url = url.execute::(&mut ctx).expect("canon"); + url.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// Many rows → many chunks via the writer's default row_block_size. 
+#[tokio::test] +async fn single_column_many_chunks() { + let n = 50_000usize; + let strings = corpus(n, 0); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["url"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let url = strct.unmasked_field(0).clone(); + let mut ctx = SESSION.create_execution_ctx(); + let url = url.execute::(&mut ctx).expect("canon"); + url.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// TPC-H supplier-shaped table: 5 string columns + a primary key + a +/// foreign key + a decimal/integer, with the row count large enough to +/// exercise multiple chunks. This is the configuration that surfaced the +/// `Misaligned buffer` error in CI. 
+#[tokio::test] +async fn tpch_supplier_shape() { + let n = 32_000usize; + let names = corpus(n, 1); + let addresses = corpus(n, 2); + let phones = corpus(n, 3); + let comments = corpus(n, 4); + let cities = corpus(n, 5); + + let suppkey: Vec = (0..n as i64).collect(); + let nationkey: Vec = (0..n as i32).map(|i| i % 25).collect(); + let acctbal: Vec = (0..n as i64).map(|i| (i * 13) % 1_000_000).collect(); + + let mk_str = |v: &[String]| -> vortex_array::ArrayRef { + VarBinViewArray::from_iter( + v.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + }; + + let data = StructArray::new( + FieldNames::from([ + "s_suppkey", + "s_name", + "s_address", + "s_nationkey", + "s_phone", + "s_acctbal", + "s_comment", + "s_city", + ]), + vec![ + PrimitiveArray::from_iter(suppkey.iter().copied()).into_array(), + mk_str(&names), + mk_str(&addresses), + PrimitiveArray::from_iter(nationkey.iter().copied()).into_array(), + mk_str(&phones), + PrimitiveArray::from_iter(acctbal.iter().copied()).into_array(), + mk_str(&comments), + mk_str(&cities), + ], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let chunk_len = strct.as_ref().len(); + let mut ctx = SESSION.create_execution_ctx(); + + let name = strct + .unmasked_field(1) + .clone() + .execute::(&mut ctx) + .unwrap(); + let address = strct + .unmasked_field(2) + .clone() + .execute::(&mut ctx) + .unwrap(); + let phone = strct + .unmasked_field(4) + .clone() + .execute::(&mut ctx) + .unwrap(); + let comment = strct + .unmasked_field(6) + .clone() + .execute::(&mut ctx) + .unwrap(); + let city = strct + .unmasked_field(7) + .clone() + .execute::(&mut ctx) + .unwrap(); + + for (s, want) in [ + (&name, &names), + (&address, &addresses), + (&phone, &phones), + (&comment, &comments), + (&city, &cities), + ] { + let base = row; + 
s.with_iterator(|iter| { + for (i, b) in iter.enumerate() { + assert_eq!(b, Some(want[base + i].as_bytes()), "row {}", base + i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + row += chunk_len; + } + assert_eq!(row, n); +} + +/// 30 short fixed strings where the dictionary blob length is unlikely to +/// be a multiple of 4. Earlier buffer orderings (dict_bytes first) tripped +/// the segment writer's first-buffer-only alignment, surfacing +/// `Misaligned buffer cannot be used to build PrimitiveArray of u32` on +/// read. +#[tokio::test] +async fn odd_dict_length_alignment() { + let words: &[&str] = &[ + "a", "bb", "ccc", "dddd", "eeeee", "fffff", "ggggggg", "h", "ii", "jjj", + ]; + let n = 20_000usize; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["w"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let mut ctx = SESSION.create_execution_ctx(); + let s = strct + .unmasked_field(0) + .clone() + .execute::(&mut ctx) + .unwrap(); + s.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// Mixed-shape strings: empty, short, very long, with a fair chunk of nulls +/// — exercising the validity child + edge offsets. 
+#[tokio::test] +async fn nullable_and_extreme_shapes() { + let n = 16_000usize; + let mut strings: Vec> = Vec::with_capacity(n); + for i in 0..n { + match i % 11 { + 0 => strings.push(None), + 1 => strings.push(Some(String::new())), + 2 => strings.push(Some("a".repeat(1024))), + 3 => strings.push(Some(format!("row-{i}"))), + _ => strings.push(Some(corpus(1, i as u64).pop().unwrap())), + } + } + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["s"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let mut ctx = SESSION.create_execution_ctx(); + let s = strct + .unmasked_field(0) + .clone() + .execute::(&mut ctx) + .unwrap(); + s.with_iterator(|iter| { + for b in iter { + let want = strings[row].as_deref().map(str::as_bytes); + assert_eq!(b, want, "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} From 383ef64d492eb7386b0cd987e60b7f321ab941a6 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 08:56:13 +0000 Subject: [PATCH 02/27] Clean up onpair encoding: drop dead public API and stale comments - Remove unused public DEFAULT_BITS const and config_with_bits fn (plus their public-api.lock entries). - Drop stale "C++" references in comments; the algorithm is the pure-Rust onpair crate, not the old FFI shim. 
Co-Authored-By: Claude Opus 4.7 Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/public-api.lock | 4 ---- encodings/experimental/onpair/src/array.rs | 4 ---- encodings/experimental/onpair/src/compress.rs | 10 +--------- encodings/experimental/onpair/src/compute/filter.rs | 2 +- encodings/experimental/onpair/src/decode.rs | 3 +-- 5 files changed, 3 insertions(+), 20 deletions(-) diff --git a/encodings/experimental/onpair/public-api.lock b/encodings/experimental/onpair/public-api.lock index bf1694761b3..247d1352723 100644 --- a/encodings/experimental/onpair/public-api.lock +++ b/encodings/experimental/onpair/public-api.lock @@ -220,8 +220,6 @@ pub fn vortex_onpair::OnPairMetadata::clear(&mut self) pub fn vortex_onpair::OnPairMetadata::encoded_len(&self) -> usize -pub const vortex_onpair::DEFAULT_BITS: u32 - pub const vortex_onpair::DEFAULT_DICT12_CONFIG: onpair::Config pub const vortex_onpair::MAX_TOKEN_SIZE: usize @@ -250,8 +248,6 @@ pub fn T::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef pub fn T::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef -pub fn vortex_onpair::config_with_bits(u32) -> onpair::Config - pub fn vortex_onpair::onpair_compress>(A, usize, &vortex_array::dtype::DType, onpair::Config) -> vortex_error::VortexResult pub fn vortex_onpair::onpair_compress_array(&vortex_array::array::erased::ArrayRef, onpair::Config, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs index f39656cec58..9f83427067e 100644 --- a/encodings/experimental/onpair/src/array.rs +++ b/encodings/experimental/onpair/src/array.rs @@ -51,10 +51,6 @@ use crate::rules::RULES; /// An [`OnPair`]-encoded Vortex array. pub type OnPairArray = Array; -/// Default bits-per-token preset used by [`crate::onpair_compress`]: 12-bit -/// codes, dictionary capped at 4 096 entries. 
-pub const DEFAULT_BITS: u32 = 12; - /// Wire-format metadata persisted alongside the OnPair buffer + slot children. /// /// On disk the layout is FSST-shape: diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs index 50fa3189a0a..bb3b6f95e88 100644 --- a/encodings/experimental/onpair/src/compress.rs +++ b/encodings/experimental/onpair/src/compress.rs @@ -29,15 +29,7 @@ use crate::OnPairArray; /// Default OnPair training configuration: 12-bit codes ("dict-12"). pub const DEFAULT_DICT12_CONFIG: Config = onpair::DEFAULT_CONFIG; -/// Build a training config with a custom bit width. -pub fn config_with_bits(bits: u32) -> Config { - Config { - bits, - ..onpair::DEFAULT_CONFIG - } -} - -/// Compress an iterable of optional byte strings via the OnPair C++ library. +/// Compress an iterable of optional byte strings via the OnPair encoder. pub fn onpair_compress_iter<'a, I>( iter: I, len: usize, diff --git a/encodings/experimental/onpair/src/compute/filter.rs b/encodings/experimental/onpair/src/compute/filter.rs index fbece54c4bb..c1dd3a1d243 100644 --- a/encodings/experimental/onpair/src/compute/filter.rs +++ b/encodings/experimental/onpair/src/compute/filter.rs @@ -11,7 +11,7 @@ //! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical** //! to the input; rebuild only `codes`, `codes_offsets`, //! `uncompressed_lengths`, and validity by walking the mask. No decode, -//! no retrain, no C++ call on the read path. +//! no retrain on the read path. use vortex_array::ArrayRef; use vortex_array::ArrayView; diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index dd434811d06..c81d891e8be 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -191,8 +191,7 @@ impl<'a> DecodeView<'a> { /// Single-pass over-copy decode of a token window into raw `dst`. 
/// - /// Mirrors OnPair C++ `decode_all` (and `decompress`) - /// exactly: each iteration loads one `u16` code, one `u64` dict-table + /// Each iteration loads one `u16` code, one `u64` dict-table /// entry, issues a fixed [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] /// `copy_nonoverlapping` (which LLVM lowers to a single unaligned /// 128-bit SIMD store on x86_64 / aarch64), and advances the cursor by From 5d03b9e13745bcca6022ed015a7310b6d7691a20 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 10:15:16 +0100 Subject: [PATCH 03/27] fix Signed-off-by: Joe Isaacs --- .../experimental/onpair/tests/big_data.rs | 37 ------------------ .../tests/test_onpair_string_roundtrip.rs | 39 ------------------- 2 files changed, 76 deletions(-) diff --git a/encodings/experimental/onpair/tests/big_data.rs b/encodings/experimental/onpair/tests/big_data.rs index 0be025dcfc5..81b1fa9e10b 100644 --- a/encodings/experimental/onpair/tests/big_data.rs +++ b/encodings/experimental/onpair/tests/big_data.rs @@ -21,12 +21,9 @@ use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; -use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; -use vortex_array::scalar_fn::fns::like::Like; -use vortex_array::scalar_fn::fns::like::LikeOptions; use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::session::ArraySession; use vortex_onpair::DEFAULT_DICT12_CONFIG; @@ -126,38 +123,4 @@ fn smoke_100k_rows() { .into_array(); assert_eq!(eq.as_bool_typed().true_count().unwrap(), want_eq); eprintln!("eq pushdown matches reference count ({})", want_eq); - - // Prefix pushdown. 
- let prefix = "https://www."; - let want_prefix = strings.iter().filter(|s| s.starts_with(prefix)).count(); - let pat = ConstantArray::new(format!("{prefix}%").as_str(), n).into_array(); - let got_prefix = Like - .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) - .unwrap() - .into_array() - .execute::(&mut ctx) - .unwrap() - .into_array() - .as_bool_typed() - .true_count() - .unwrap(); - assert_eq!(got_prefix, want_prefix); - eprintln!("starts_with pushdown matches reference ({})", want_prefix); - - // Contains pushdown. - let sub = "status=500"; - let want_sub = strings.iter().filter(|s| s.contains(sub)).count(); - let pat = ConstantArray::new(format!("%{sub}%").as_str(), n).into_array(); - let got_sub = Like - .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) - .unwrap() - .into_array() - .execute::(&mut ctx) - .unwrap() - .into_array() - .as_bool_typed() - .true_count() - .unwrap(); - assert_eq!(got_sub, want_sub); - eprintln!("contains pushdown matches reference ({})", want_sub); } diff --git a/vortex-file/tests/test_onpair_string_roundtrip.rs b/vortex-file/tests/test_onpair_string_roundtrip.rs index 803c6869e46..f51f68deb21 100644 --- a/vortex-file/tests/test_onpair_string_roundtrip.rs +++ b/vortex-file/tests/test_onpair_string_roundtrip.rs @@ -115,45 +115,6 @@ async fn write_and_read_back(data: vortex_array::ArrayRef) -> Vec() - .expect("Struct"); - let url = strct.unmasked_field(0).clone(); - let mut ctx = SESSION.create_execution_ctx(); - let url = url.execute::(&mut ctx).expect("canon"); - url.with_iterator(|iter| { - for b in iter { - assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); - row += 1; - } - Ok::<_, vortex_error::VortexError>(()) - }) - .unwrap(); - } - assert_eq!(row, n); -} - /// Many rows → many chunks via the writer's default row_block_size. 
#[tokio::test] async fn single_column_many_chunks() { From 0dc843c52919a8be5b8be54f1e18004d489d54a8 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 10:49:20 +0100 Subject: [PATCH 04/27] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 16 +- Cargo.toml | 2 +- encodings/experimental/onpair/src/array.rs | 166 +++++++----------- .../experimental/onpair/src/canonical.rs | 2 +- .../experimental/onpair/src/compute/cast.rs | 14 +- .../experimental/onpair/src/compute/filter.rs | 1 + .../experimental/onpair/src/compute/mod.rs | 1 + .../onpair/src/{ => compute}/slice.rs | 1 + encodings/experimental/onpair/src/decode.rs | 2 +- encodings/experimental/onpair/src/kernel.rs | 11 +- encodings/experimental/onpair/src/lib.rs | 1 - encodings/experimental/onpair/src/tests.rs | 1 + 12 files changed, 86 insertions(+), 132 deletions(-) rename encodings/experimental/onpair/src/{ => compute}/slice.rs (97%) diff --git a/Cargo.lock b/Cargo.lock index c74f1a876de..dd93bc07661 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -799,7 +799,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.13.0", "log", "prettyplease", "proc-macro2", @@ -1317,7 +1317,7 @@ checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681" dependencies = [ "serde", "termcolor", - "unicode-width 0.1.14", + "unicode-width 0.2.2", ] [[package]] @@ -5725,7 +5725,9 @@ checksum = "cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" [[package]] name = "onpair" -version = "0.0.1" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b630b38fb60f69bb1d6125b08cda2c93b350e6c90724af37c66dfb83e40c85e" dependencies = [ "hashbrown 0.16.1", "rand 0.9.4", @@ -6145,9 +6147,9 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "044b1fa4f259f4df9ad5078e587b208f5d288a25407575fcddb9face30c7c692" dependencies = [ - "rand 0.8.6", + "rand 0.9.4", "socket2", - "thiserror 
1.0.69", + "thiserror 2.0.18", ] [[package]] @@ -6360,7 +6362,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.10.5", + "itertools 0.14.0", "log", "multimap", "petgraph", @@ -6392,7 +6394,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.117", diff --git a/Cargo.toml b/Cargo.toml index a696d96d8ae..a9253ec08dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -190,7 +190,7 @@ num_enum = { version = "0.7.3", default-features = false } object_store = { version = "0.13.1", default-features = false } once_cell = "1.21" oneshot = { version = "0.2.0", features = ["async"] } -onpair = { path = "../onpair" } +onpair = { version = "0.0.2" } opentelemetry = "0.32.0" opentelemetry-otlp = "0.32.0" opentelemetry_sdk = "0.32.0" diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs index 9f83427067e..6183406cfae 100644 --- a/encodings/experimental/onpair/src/array.rs +++ b/encodings/experimental/onpair/src/array.rs @@ -13,14 +13,13 @@ use vortex_array::ArrayHash; use vortex_array::ArrayId; use vortex_array::ArrayParts; use vortex_array::ArrayRef; -use vortex_array::ArraySlots; use vortex_array::ArrayView; use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::ExecutionResult; use vortex_array::IntoArray; use vortex_array::Precision; -use vortex_array::TypedArrayRef; +use vortex_array::array_slots; use vortex_array::buffer::BufferHandle; use vortex_array::builders::ArrayBuilder; use vortex_array::builders::VarBinViewBuilder; @@ -28,7 +27,6 @@ use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; use vortex_array::serde::ArrayChildren; 
-use vortex_array::smallvec::smallvec; use vortex_array::validity::Validity; use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityVTable; @@ -58,20 +56,12 @@ pub type OnPairArray = Array; /// * Buffer 0 — `dict_bytes`: the dictionary blob built by the OnPair trainer, /// padded with [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] trailing zero /// bytes so the over-copy decoder can read 16 bytes past the last token. -/// * Slot 0 — `dict_offsets`: `PrimitiveArray`, len `dict_size + 1`. -/// * Slot 1 — `codes`: `PrimitiveArray`. Each value only uses its low -/// `bits` bits; downstream `FastLanes::BitPacking` losslessly shrinks -/// the child to exactly `bits`-bit codes on disk. -/// * Slot 2 — `codes_offsets`: `PrimitiveArray`, len `num_rows + 1`. -/// FoR / RunEnd / etc. apply naturally via the cascading compressor. -/// * Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, len -/// `num_rows`. Used to size the canonical output buffer. -/// * Slot 4 — optional validity child. +/// * Slots — see [`OnPairSlots`]. /// -/// All three integer slot children flow through the standard -/// `compress_child` pipeline (see `vortex-btrblocks::schemes::string:: -/// OnPairScheme`), so any encoding registered with the compressor can -/// re-encode them — exactly the same shape as FSST's `codes` `VarBinArray`. +/// The four integer slot children flow through the standard `compress_child` +/// pipeline (see `vortex-btrblocks::schemes::string::OnPairScheme`), so any +/// encoding registered with the compressor can re-encode them — exactly the +/// same shape as FSST's `codes` `VarBinArray`. #[derive(Clone, prost::Message)] pub struct OnPairMetadata { /// Width of the per-row primitive `uncompressed_lengths` child. @@ -82,8 +72,9 @@ pub struct OnPairMetadata { #[prost(uint32, tag = "2")] pub bits: u32, /// Number of dictionary tokens. `dict_offsets` has length `dict_size + 1`. 
- #[prost(uint64, tag = "3")] - pub dict_size: u64, + /// Bounded by `2^bits ≤ 2^16 = 65_536`, so `u32` is comfortably wide. + #[prost(uint32, tag = "3")] + pub dict_size: u32, /// Total number of tokens across all rows. `codes` has this length; /// `codes_offsets.last() == total_tokens`. #[prost(uint64, tag = "4")] @@ -108,20 +99,24 @@ impl OnPairMetadata { } } -/// Slot indices on the outer [`Array`]. -pub(crate) const DICT_OFFSETS_SLOT: usize = 0; -pub(crate) const CODES_SLOT: usize = 1; -pub(crate) const CODES_OFFSETS_SLOT: usize = 2; -pub(crate) const UNCOMPRESSED_LENGTHS_SLOT: usize = 3; -pub(crate) const VALIDITY_SLOT: usize = 4; -pub(crate) const NUM_SLOTS: usize = 5; -pub(crate) const SLOT_NAMES: [&str; NUM_SLOTS] = [ - "dict_offsets", - "codes", - "codes_offsets", - "uncompressed_lengths", - "validity", -]; +#[array_slots(OnPair)] +pub struct OnPairSlots { + /// `PrimitiveArray`, length `dict_size + 1`. Cascading compressor may + /// narrow the ptype to U16/U8. + pub dict_offsets: ArrayRef, + /// `PrimitiveArray`. Each value only uses its low `bits` bits; + /// downstream `FastLanes::BitPacking` losslessly shrinks the child to + /// exactly `bits`-bit codes on disk. + pub codes: ArrayRef, + /// `PrimitiveArray`, length `num_rows + 1`. FoR / RunEnd / etc. apply + /// naturally via the cascading compressor. + pub codes_offsets: ArrayRef, + /// Integer `PrimitiveArray`, length `num_rows`. Used to size the canonical + /// output buffer. + pub uncompressed_lengths: ArrayRef, + /// Optional validity child for the outer string column. + pub validity: Option, +} /// Inner data for an OnPair-encoded array. 
/// @@ -232,13 +227,14 @@ impl OnPair { )?; let len = uncompressed_lengths.len(); let data = OnPairData::new(dict_bytes, bits, len); - let slots: ArraySlots = smallvec![ - Some(dict_offsets), - Some(codes), - Some(codes_offsets), - Some(uncompressed_lengths), - validity_to_child(&validity, len), - ]; + let slots = OnPairSlots { + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity: validity_to_child(&validity, len), + } + .into_slots(); Ok(unsafe { Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) }) @@ -257,13 +253,14 @@ impl OnPair { ) -> OnPairArray { let len = uncompressed_lengths.len(); let data = OnPairData::new(dict_bytes, bits, len); - let slots: ArraySlots = smallvec![ - Some(dict_offsets), - Some(codes), - Some(codes_offsets), - Some(uncompressed_lengths), - validity_to_child(&validity, len), - ]; + let slots = OnPairSlots { + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity: validity_to_child(&validity, len), + } + .into_slots(); unsafe { Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) } @@ -323,27 +320,16 @@ impl VTable for OnPair { len: usize, slots: &[Option], ) -> VortexResult<()> { - let dict_offsets = slots[DICT_OFFSETS_SLOT] - .as_ref() - .ok_or_else(|| vortex_err!("OnPairArray dict_offsets slot missing"))?; - let codes = slots[CODES_SLOT] - .as_ref() - .ok_or_else(|| vortex_err!("OnPairArray codes slot missing"))?; - let codes_offsets = slots[CODES_OFFSETS_SLOT] - .as_ref() - .ok_or_else(|| vortex_err!("OnPairArray codes_offsets slot missing"))?; - let uncompressed_lengths = slots[UNCOMPRESSED_LENGTHS_SLOT] - .as_ref() - .ok_or_else(|| vortex_err!("OnPairArray uncompressed_lengths slot missing"))?; + let s = OnPairSlotsView::from_slots(slots); validate_parts( dtype, - dict_offsets, - codes, - codes_offsets, - uncompressed_lengths, + s.dict_offsets, + s.codes, + s.codes_offsets, + s.uncompressed_lengths, data.bits, )?; - 
if uncompressed_lengths.len() != len { + if s.uncompressed_lengths.len() != len { vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); } if data.len != len { @@ -374,7 +360,8 @@ impl VTable for OnPair { array: ArrayView<'_, Self>, _session: &VortexSession, ) -> VortexResult>> { - let dict_size = array.dict_offsets().len().saturating_sub(1) as u64; + let dict_size = u32::try_from(array.dict_offsets().len().saturating_sub(1)) + .map_err(|_| vortex_err!("OnPair dict_size exceeds u32"))?; let total_tokens = array.codes().len() as u64; Ok(Some( OnPairMetadata { @@ -408,8 +395,7 @@ impl VTable for OnPair { // Slot children. We pass `usize::MAX` for slots whose length we // don't know up front (`dict_offsets` and `codes`). `codes_offsets` // has known length `len + 1`. - let dict_offsets_len = usize::try_from(metadata.dict_size + 1) - .map_err(|_| vortex_err!("dict_size {} overflows usize", metadata.dict_size))?; + let dict_offsets_len = metadata.dict_size as usize + 1; let total_tokens = usize::try_from(metadata.total_tokens) .map_err(|_| vortex_err!("total_tokens {} overflows usize", metadata.total_tokens))?; // The cascading compressor may have narrowed any of these integer @@ -453,18 +439,19 @@ impl VTable for OnPair { }; let data = OnPairData::new(buffers[0].clone(), metadata.bits, len); - let slots: ArraySlots = smallvec![ - Some(dict_offsets), - Some(codes), - Some(codes_offsets), - Some(uncompressed_lengths), - validity_to_child(&validity, len), - ]; + let slots = OnPairSlots { + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity: validity_to_child(&validity, len), + } + .into_slots(); Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) } fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { - SLOT_NAMES[idx].to_string() + OnPairSlots::NAMES[idx].to_string() } fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { @@ -521,41 +508,20 @@ impl VTable for 
OnPair { impl ValidityVTable for OnPair { fn validity(array: ArrayView<'_, OnPair>) -> VortexResult { Ok(child_to_validity( - array.slots()[VALIDITY_SLOT].as_ref(), + array.slots()[OnPairSlots::VALIDITY].as_ref(), array.dtype().nullability(), )) } } -/// Convenience extension trait. Slot accessors live here; methods reachable -/// through `OnPairData` flow via the `ArrayView -> Deref` chain. -pub trait OnPairArrayExt: TypedArrayRef { - fn dict_offsets(&self) -> &ArrayRef { - self.as_ref().slots()[DICT_OFFSETS_SLOT] - .as_ref() - .unwrap_or_else(|| vortex_panic!("OnPairArray dict_offsets slot missing")) - } - fn codes(&self) -> &ArrayRef { - self.as_ref().slots()[CODES_SLOT] - .as_ref() - .unwrap_or_else(|| vortex_panic!("OnPairArray codes slot missing")) - } - fn codes_offsets(&self) -> &ArrayRef { - self.as_ref().slots()[CODES_OFFSETS_SLOT] - .as_ref() - .unwrap_or_else(|| vortex_panic!("OnPairArray codes_offsets slot missing")) - } - fn uncompressed_lengths(&self) -> &ArrayRef { - self.as_ref().slots()[UNCOMPRESSED_LENGTHS_SLOT] - .as_ref() - .unwrap_or_else(|| vortex_panic!("OnPairArray uncompressed_lengths slot missing")) - } +/// Convenience methods on top of the macro-generated [`OnPairArraySlotsExt`]. 
+pub trait OnPairArrayExt: OnPairArraySlotsExt { fn array_validity(&self) -> Validity { child_to_validity( - self.as_ref().slots()[VALIDITY_SLOT].as_ref(), + self.as_ref().slots()[OnPairSlots::VALIDITY].as_ref(), self.as_ref().dtype().nullability(), ) } } -impl> OnPairArrayExt for T {} +impl OnPairArrayExt for T {} diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs index 368c5ab0b7a..9837d63a35f 100644 --- a/encodings/experimental/onpair/src/canonical.rs +++ b/encodings/experimental/onpair/src/canonical.rs @@ -22,7 +22,7 @@ use vortex_buffer::ByteBufferMut; use vortex_error::VortexResult; use crate::OnPair; -use crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; use crate::decode::OwnedDecodeInputs; pub(super) fn canonicalize_onpair( diff --git a/encodings/experimental/onpair/src/compute/cast.rs b/encodings/experimental/onpair/src/compute/cast.rs index 27b4ad378c7..93e1fdd8f8a 100644 --- a/encodings/experimental/onpair/src/compute/cast.rs +++ b/encodings/experimental/onpair/src/compute/cast.rs @@ -3,15 +3,13 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; -use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::dtype::DType; -use vortex_array::scalar_fn::fns::cast::CastKernel; use vortex_array::scalar_fn::fns::cast::CastReduce; use vortex_error::VortexResult; use crate::OnPair; -use crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; /// Cast between `Utf8` and `Binary` (or adjust nullability) without touching /// any of the encoded payload — we only rewrap into a new outer DType. 
@@ -43,13 +41,3 @@ impl CastReduce for OnPair { )) } } - -impl CastKernel for OnPair { - fn cast( - array: ArrayView<'_, Self>, - dtype: &DType, - _ctx: &mut ExecutionCtx, - ) -> VortexResult> { - ::cast(array, dtype) - } -} diff --git a/encodings/experimental/onpair/src/compute/filter.rs b/encodings/experimental/onpair/src/compute/filter.rs index c1dd3a1d243..736528f956a 100644 --- a/encodings/experimental/onpair/src/compute/filter.rs +++ b/encodings/experimental/onpair/src/compute/filter.rs @@ -27,6 +27,7 @@ use vortex_mask::Mask; use crate::OnPair; use crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; impl FilterKernel for OnPair { #[expect(clippy::cognitive_complexity, clippy::cast_possible_truncation)] diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs index e33c49b80f1..4cb15868625 100644 --- a/encodings/experimental/onpair/src/compute/mod.rs +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -3,3 +3,4 @@ mod cast; mod filter; +mod slice; diff --git a/encodings/experimental/onpair/src/slice.rs b/encodings/experimental/onpair/src/compute/slice.rs similarity index 97% rename from encodings/experimental/onpair/src/slice.rs rename to encodings/experimental/onpair/src/compute/slice.rs index 48f3d6b8d16..fcfebf413bf 100644 --- a/encodings/experimental/onpair/src/slice.rs +++ b/encodings/experimental/onpair/src/compute/slice.rs @@ -17,6 +17,7 @@ use vortex_error::VortexResult; use crate::OnPair; use crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; impl SliceReduce for OnPair { fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index c81d891e8be..a7697977a70 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -22,7 +22,7 @@ use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use crate::OnPair; -use 
crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; /// Materialised, host-resident copies of every read path's input. /// diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs index e8c891f5875..99fb5bf2d14 100644 --- a/encodings/experimental/onpair/src/kernel.rs +++ b/encodings/experimental/onpair/src/kernel.rs @@ -3,14 +3,9 @@ use vortex_array::arrays::filter::FilterExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; -use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; use crate::OnPair; -// TODO: implement TakeExecute for OnPair to add a TakeExecuteAdaptor here -// (matches the FSST pattern; would dispatch take on the codes child + reuse -// the dictionary, mirroring the slice path). -pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ - ParentKernelSet::lift(&CastExecuteAdaptor(OnPair)), - ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)), -]); +// TODO: implement TakeExecute for OnPair +pub(super) const PARENT_KERNELS: ParentKernelSet = + ParentKernelSet::new(&[ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair))]); diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs index 7b247c71607..b07ee395bf9 100644 --- a/encodings/experimental/onpair/src/lib.rs +++ b/encodings/experimental/onpair/src/lib.rs @@ -18,7 +18,6 @@ pub mod decode; mod kernel; mod ops; mod rules; -mod slice; #[cfg(test)] mod tests; diff --git a/encodings/experimental/onpair/src/tests.rs b/encodings/experimental/onpair/src/tests.rs index 2a2abb62d80..e1cec96a1c2 100644 --- a/encodings/experimental/onpair/src/tests.rs +++ b/encodings/experimental/onpair/src/tests.rs @@ -23,6 +23,7 @@ use vortex_session::VortexSession; use crate::OnPair; use crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; use crate::OnPairMetadata; use crate::compress::DEFAULT_DICT12_CONFIG; use crate::compress::onpair_compress; From 4ca21b874805641fd345f61cc7dbff31b1802a21 Mon Sep 17 
00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 10:53:53 +0100 Subject: [PATCH 05/27] fix Signed-off-by: Joe Isaacs --- vortex-btrblocks/Cargo.toml | 6 +- vortex-btrblocks/src/schemes/string.rs | 361 ++++++++--------- .../tests/test_onpair_string_roundtrip.rs | 365 ------------------ 3 files changed, 192 insertions(+), 540 deletions(-) delete mode 100644 vortex-file/tests/test_onpair_string_roundtrip.rs diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 493c1684318..1adb6508828 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -49,7 +49,11 @@ vortex-session = { workspace = true } [features] # This feature enabled unstable encodings for which we don't guarantee stability. -unstable_encodings = ["dep:vortex-tensor", "dep:vortex-onpair", "vortex-zstd?/unstable_encodings"] +unstable_encodings = [ + "dep:vortex-tensor", + "dep:vortex-onpair", + "vortex-zstd?/unstable_encodings", +] pco = ["dep:pco", "dep:vortex-pco"] zstd = ["dep:vortex-zstd"] diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 753a1deb8f4..2b73d3daab1 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -21,21 +21,11 @@ use vortex_fsst::FSST; use vortex_fsst::FSSTArrayExt; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; -#[cfg(feature = "unstable_encodings")] -use vortex_onpair::DEFAULT_DICT12_CONFIG; -#[cfg(feature = "unstable_encodings")] -use vortex_onpair::OnPair; -#[cfg(feature = "unstable_encodings")] -use vortex_onpair::OnPairArrayExt; -#[cfg(feature = "unstable_encodings")] -use vortex_onpair::onpair_compress; use vortex_sparse::Sparse; use vortex_sparse::SparseExt as _; use super::integer::IntDictScheme; use super::integer::SparseScheme as IntSparseScheme; -#[cfg(feature = "unstable_encodings")] -use super::integer::try_compress_delta; use crate::ArrayAndStats; use crate::CascadingCompressor; use 
crate::CompressorContext; @@ -50,17 +40,8 @@ use crate::SchemeExt; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; -/// OnPair short-string compression (dict-12). -/// -/// A default string-fragmentation scheme (alongside [`FSSTScheme`]) — targets -/// large columns of short-to-medium strings with high lexical overlap, like -/// URLs or log lines. Uses a learned dictionary of frequent adjacent substrings -/// (built by the OnPair trainer at compress time) and 12-bit token codes stored -/// as a u16 child, with offsets / uncompressed-lengths flowing through the -/// cascading compressor like any other primitive children. #[cfg(feature = "unstable_encodings")] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct OnPairScheme; +pub use onpair::OnPairScheme; /// Sparse encoding for null-dominated arrays. /// @@ -164,160 +145,6 @@ impl Scheme for FSSTScheme { } } -#[cfg(feature = "unstable_encodings")] -impl Scheme for OnPairScheme { - fn scheme_name(&self) -> &'static str { - "vortex.string.onpair" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) - } - - /// One slot child: `uncompressed_lengths`. The dictionary blob, dictionary - /// offsets, codes (u16), and codes offsets all live as raw byte buffers - /// on the OnPair array — they're not primitive slot children, so the - /// cascading compressor doesn't recompress them. Codes intentionally - /// 4 primitive slot children flow through the cascading compressor: - /// `dict_offsets` (u32 → typically `FoR`/`BitPacked`), `codes` (u16 → - /// `FastLanes::BitPacked` to exactly `bits` = 12 by default), - /// `codes_offsets` (u32 → `FoR`), `uncompressed_lengths` (i32 → narrow - /// + `FoR`). Validity stays untouched. 
- fn num_children(&self) -> usize { - 4 - } - - fn expected_compression_ratio( - &self, - _data: &ArrayAndStats, - _compress_ctx: CompressorContext, - _exec_ctx: &mut ExecutionCtx, - ) -> CompressionEstimate { - CompressionEstimate::Deferred(DeferredEstimate::Sample) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &ArrayAndStats, - compress_ctx: CompressorContext, - exec_ctx: &mut ExecutionCtx, - ) -> VortexResult { - let utf8 = data.array_as_utf8().into_owned(); - let onpair_array = onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; - - let dict_offsets = compress_offsets_child( - compressor, - onpair_array.dict_offsets(), - &compress_ctx, - self.id(), - 0, - exec_ctx, - )?; - let codes = compress_primitive_child( - compressor, - onpair_array.codes(), - &compress_ctx, - self.id(), - 1, - exec_ctx, - )?; - let codes_offsets = compress_offsets_child( - compressor, - onpair_array.codes_offsets(), - &compress_ctx, - self.id(), - 2, - exec_ctx, - )?; - let uncompressed_lengths = compress_primitive_child( - compressor, - onpair_array.uncompressed_lengths(), - &compress_ctx, - self.id(), - 3, - exec_ctx, - )?; - - Ok(OnPair::try_new( - onpair_array.dtype().clone(), - onpair_array.dict_bytes_handle().clone(), - dict_offsets, - codes, - codes_offsets, - uncompressed_lengths, - onpair_array.array_validity(), - onpair_array.bits(), - )? - .into_array()) - } -} - -/// Narrow a primitive child to its tightest int type, then forward it to -/// the cascading compressor. -#[cfg(feature = "unstable_encodings")] -fn compress_primitive_child( - compressor: &CascadingCompressor, - child: &ArrayRef, - compress_ctx: &CompressorContext, - scheme_id: vortex_compressor::scheme::SchemeId, - child_idx: usize, - exec_ctx: &mut ExecutionCtx, -) -> VortexResult { - let narrowed = child - .clone() - .execute::(exec_ctx)? - .narrow(exec_ctx)? 
- .into_array(); - compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx) -} - -/// Minimum child length before delta is even attempted. Delta carries fixed -/// overhead (a separate `bases` array plus FastLanes' 1024-element lane -/// packing), so on short children it can only lose. -#[cfg(feature = "unstable_encodings")] -const OFFSETS_DELTA_MIN_LEN: usize = 2048; - -/// Compress a monotonic offsets child. For children of at least -/// [`OFFSETS_DELTA_MIN_LEN`] it tries both the normal cascading path and a -/// delta path and keeps whichever produces fewer bytes; shorter children skip -/// delta entirely. `dict_offsets` and `codes_offsets` are cumulative -/// (monotonic), so delta (per-entry deltas) usually packs much tighter than -/// FoR+bitpacking over the full range. -#[cfg(feature = "unstable_encodings")] -fn compress_offsets_child( - compressor: &CascadingCompressor, - child: &ArrayRef, - compress_ctx: &CompressorContext, - scheme_id: vortex_compressor::scheme::SchemeId, - child_idx: usize, - exec_ctx: &mut ExecutionCtx, -) -> VortexResult { - let narrowed = child - .clone() - .execute::(exec_ctx)? - .narrow(exec_ctx)? 
- .into_array(); - let plain = - compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx)?; - if narrowed.len() < OFFSETS_DELTA_MIN_LEN { - return Ok(plain); - } - let delta = try_compress_delta( - compressor, - &narrowed, - compress_ctx, - scheme_id, - child_idx, - exec_ctx, - )?; - if delta.nbytes() < plain.nbytes() { - Ok(delta) - } else { - Ok(plain) - } -} - impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" @@ -473,6 +300,192 @@ impl Scheme for ZstdBuffersScheme { } } +#[cfg(feature = "unstable_encodings")] +mod onpair { + use vortex_array::ArrayRef; + use vortex_array::Canonical; + use vortex_array::ExecutionCtx; + use vortex_array::IntoArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::primitive::PrimitiveArrayExt; + use vortex_compressor::estimate::CompressionEstimate; + use vortex_compressor::estimate::DeferredEstimate; + use vortex_compressor::scheme::SchemeId; + use vortex_error::VortexResult; + use vortex_onpair::DEFAULT_DICT12_CONFIG; + use vortex_onpair::OnPair; + use vortex_onpair::OnPairArrayExt; + use vortex_onpair::OnPairArraySlotsExt; + use vortex_onpair::onpair_compress; + + use super::is_utf8_string; + use crate::ArrayAndStats; + use crate::CascadingCompressor; + use crate::CompressorContext; + use crate::Scheme; + use crate::SchemeExt; + use crate::schemes::integer::try_compress_delta; + + /// OnPair short-string compression (dict-12). + /// + /// A default string-fragmentation scheme (alongside [`super::FSSTScheme`]) — + /// targets large columns of short-to-medium strings with high lexical + /// overlap, like URLs or log lines. Uses a learned dictionary of frequent + /// adjacent substrings (built by the OnPair trainer at compress time) and + /// 12-bit token codes stored as a u16 child, with offsets / + /// uncompressed-lengths flowing through the cascading compressor like any + /// other primitive children. 
+ #[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct OnPairScheme; + + impl Scheme for OnPairScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.onpair" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// 4 primitive slot children flow through the cascading compressor: + /// `dict_offsets` (u32 → typically `FoR`/`BitPacked`), `codes` (u16 → + /// `FastLanes::BitPacked` to exactly `bits` = 12 by default), + /// `codes_offsets` (u32 → `FoR`), `uncompressed_lengths` (i32 → narrow + /// + `FoR`). Validity stays untouched. + fn num_children(&self) -> usize { + 4 + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Sample) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let utf8 = data.array_as_utf8().into_owned(); + let onpair_array = + onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; + + let dict_offsets = compress_offsets_child( + compressor, + onpair_array.dict_offsets(), + &compress_ctx, + self.id(), + 0, + exec_ctx, + )?; + let codes = compress_primitive_child( + compressor, + onpair_array.codes(), + &compress_ctx, + self.id(), + 1, + exec_ctx, + )?; + let codes_offsets = compress_offsets_child( + compressor, + onpair_array.codes_offsets(), + &compress_ctx, + self.id(), + 2, + exec_ctx, + )?; + let uncompressed_lengths = compress_primitive_child( + compressor, + onpair_array.uncompressed_lengths(), + &compress_ctx, + self.id(), + 3, + exec_ctx, + )?; + + Ok(OnPair::try_new( + onpair_array.dtype().clone(), + onpair_array.dict_bytes_handle().clone(), + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + onpair_array.array_validity(), + onpair_array.bits(), + )? 
+ .into_array()) + } + } + + /// Narrow a primitive child to its tightest int type, then forward it to + /// the cascading compressor. + fn compress_primitive_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow(exec_ctx)? + .into_array(); + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx) + } + + /// Minimum child length before delta is even attempted. Delta carries fixed + /// overhead (a separate `bases` array plus FastLanes' 1024-element lane + /// packing), so on short children it can only lose. + const OFFSETS_DELTA_MIN_LEN: usize = 2048; + + /// Compress a monotonic offsets child. For children of at least + /// [`OFFSETS_DELTA_MIN_LEN`] it tries both the normal cascading path and a + /// delta path and keeps whichever produces fewer bytes; shorter children + /// skip delta entirely. `dict_offsets` and `codes_offsets` are cumulative + /// (monotonic), so delta (per-entry deltas) usually packs much tighter than + /// FoR+bitpacking over the full range. + fn compress_offsets_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow(exec_ctx)? 
+ .into_array(); + let plain = + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx)?; + if narrowed.len() < OFFSETS_DELTA_MIN_LEN { + return Ok(plain); + } + let delta = try_compress_delta( + compressor, + &narrowed, + compress_ctx, + scheme_id, + child_idx, + exec_ctx, + )?; + if delta.nbytes() < plain.nbytes() { + Ok(delta) + } else { + Ok(plain) + } + } +} + #[cfg(test)] mod tests { use std::sync::LazyLock; diff --git a/vortex-file/tests/test_onpair_string_roundtrip.rs b/vortex-file/tests/test_onpair_string_roundtrip.rs deleted file mode 100644 index f51f68deb21..00000000000 --- a/vortex-file/tests/test_onpair_string_roundtrip.rs +++ /dev/null @@ -1,365 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors -// -//! Round-trip stress tests for OnPair through the full Vortex file writer + -//! reader. Mirrors the call shape `vortex-bench/src/conversions.rs` uses and -//! the multi-column, many-chunk pattern of TPC-H tables (`supplier_0.vortex` -//! is the file from which CI surfaced -//! `Misaligned buffer cannot be used to build PrimitiveArray of u32`). 
- -#![cfg(feature = "unstable_encodings")] -#![expect( - clippy::cast_possible_truncation, - clippy::tests_outside_test_module, - clippy::redundant_clone -)] - -use std::sync::LazyLock; - -use futures::StreamExt; -use futures::pin_mut; -use vortex_array::IntoArray; -use vortex_array::VortexSessionExecute; -use vortex_array::accessor::ArrayAccessor; -use vortex_array::aggregate_fn::session::AggregateFnSession; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::StructArray; -use vortex_array::arrays::VarBinViewArray; -use vortex_array::arrays::struct_::StructArrayExt; -use vortex_array::dtype::DType; -use vortex_array::dtype::FieldNames; -use vortex_array::dtype::Nullability; -use vortex_array::dtype::session::DTypeSession; -use vortex_array::optimizer::kernels::ArrayKernels; -use vortex_array::scalar_fn::session::ScalarFnSession; -use vortex_array::session::ArraySession; -use vortex_array::validity::Validity; -use vortex_buffer::ByteBuffer; -use vortex_file::OpenOptionsSessionExt; -use vortex_file::WriteOptionsSessionExt; -use vortex_io::session::RuntimeSession; -use vortex_layout::session::LayoutSession; -use vortex_session::VortexSession; - -/// Full default Vortex session — the same set of sub-sessions -/// `vortex::VortexSession::default()` would install, plus -/// `register_default_encodings`. Built inline here because `vortex-file` -/// can't depend on the umbrella `vortex` crate (it's the other way round). 
-static SESSION: LazyLock = LazyLock::new(|| { - let session = VortexSession::empty() - .with::() - .with::() - .with::() - .with::() - .with::() - .with::() - .with::(); - vortex_file::register_default_encodings(&session); - session -}); - -fn corpus(n: usize, offset: u64) -> Vec { - let templates: &[&str] = &[ - "https://www.example.com/products/{id}", - "https://cdn.example.com/img/{id}.webp", - "https://api.example.com/v2/orders/{id}", - "https://www.example.com/users/{id}/profile", - "INFO request_id={id} status=200 method=GET", - "WARN request_id={id} status=429 method=POST", - "ERROR request_id={id} status=500 method=PUT", - ]; - let mut out = Vec::with_capacity(n); - let mut state = 0x9e37_79b9_7f4a_7c15_u64.wrapping_add(offset); - for _ in 0..n { - state = state - .wrapping_mul(6364136223846793005) - .wrapping_add(1442695040888963407); - let pick = (state as usize) % templates.len(); - let id = state as u32; - out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); - } - out -} - -/// Write `data` to an in-memory `Vec` using the **full default Vortex -/// compressor** (`WriteStrategyBuilder::default()` = -/// `BtrBlocksCompressor::default()` cascading through every registered -/// scheme, including OnPair), then open the resulting bytes via -/// `OpenOptions::open_buffer` and stream every chunk back. -async fn write_and_read_back(data: vortex_array::ArrayRef) -> Vec { - // `write_options()` builds a `VortexWriteOptions` whose `strategy` is - // `WriteStrategyBuilder::default().build()` — the same path `vortex-bench` - // uses for Parquet → Vortex conversion. No custom strategy injected. - let mut bytes = Vec::new(); - SESSION - .write_options() - .write(&mut bytes, data.to_array_stream()) - .await - .expect("write Vortex file"); - - // Read back from the in-memory byte buffer; no disk, no FS. 
- let bytes = ByteBuffer::from(bytes); - let vxf = SESSION.open_options().open_buffer(bytes).expect("open"); - - let stream = vxf - .scan() - .expect("scan") - .into_stream() - .expect("into_stream"); - pin_mut!(stream); - - let mut chunks = Vec::new(); - while let Some(chunk) = stream.next().await { - chunks.push(chunk.expect("chunk")); - } - chunks -} - -/// Many rows → many chunks via the writer's default row_block_size. -#[tokio::test] -async fn single_column_many_chunks() { - let n = 50_000usize; - let strings = corpus(n, 0); - let str_array = VarBinViewArray::from_iter( - strings.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ) - .into_array(); - let data = StructArray::new( - FieldNames::from(["url"]), - vec![str_array], - n, - Validity::NonNullable, - ) - .into_array(); - - let chunks = write_and_read_back(data).await; - let mut row = 0; - for chunk in chunks { - let strct = chunk - .try_downcast::() - .expect("Struct"); - let url = strct.unmasked_field(0).clone(); - let mut ctx = SESSION.create_execution_ctx(); - let url = url.execute::(&mut ctx).expect("canon"); - url.with_iterator(|iter| { - for b in iter { - assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); - row += 1; - } - Ok::<_, vortex_error::VortexError>(()) - }) - .unwrap(); - } - assert_eq!(row, n); -} - -/// TPC-H supplier-shaped table: 5 string columns + a primary key + a -/// foreign key + a decimal/integer, with the row count large enough to -/// exercise multiple chunks. This is the configuration that surfaced the -/// `Misaligned buffer` error in CI. 
-#[tokio::test] -async fn tpch_supplier_shape() { - let n = 32_000usize; - let names = corpus(n, 1); - let addresses = corpus(n, 2); - let phones = corpus(n, 3); - let comments = corpus(n, 4); - let cities = corpus(n, 5); - - let suppkey: Vec = (0..n as i64).collect(); - let nationkey: Vec = (0..n as i32).map(|i| i % 25).collect(); - let acctbal: Vec = (0..n as i64).map(|i| (i * 13) % 1_000_000).collect(); - - let mk_str = |v: &[String]| -> vortex_array::ArrayRef { - VarBinViewArray::from_iter( - v.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ) - .into_array() - }; - - let data = StructArray::new( - FieldNames::from([ - "s_suppkey", - "s_name", - "s_address", - "s_nationkey", - "s_phone", - "s_acctbal", - "s_comment", - "s_city", - ]), - vec![ - PrimitiveArray::from_iter(suppkey.iter().copied()).into_array(), - mk_str(&names), - mk_str(&addresses), - PrimitiveArray::from_iter(nationkey.iter().copied()).into_array(), - mk_str(&phones), - PrimitiveArray::from_iter(acctbal.iter().copied()).into_array(), - mk_str(&comments), - mk_str(&cities), - ], - n, - Validity::NonNullable, - ) - .into_array(); - - let chunks = write_and_read_back(data).await; - - let mut row = 0; - for chunk in chunks { - let strct = chunk - .try_downcast::() - .expect("Struct"); - let chunk_len = strct.as_ref().len(); - let mut ctx = SESSION.create_execution_ctx(); - - let name = strct - .unmasked_field(1) - .clone() - .execute::(&mut ctx) - .unwrap(); - let address = strct - .unmasked_field(2) - .clone() - .execute::(&mut ctx) - .unwrap(); - let phone = strct - .unmasked_field(4) - .clone() - .execute::(&mut ctx) - .unwrap(); - let comment = strct - .unmasked_field(6) - .clone() - .execute::(&mut ctx) - .unwrap(); - let city = strct - .unmasked_field(7) - .clone() - .execute::(&mut ctx) - .unwrap(); - - for (s, want) in [ - (&name, &names), - (&address, &addresses), - (&phone, &phones), - (&comment, &comments), - (&city, &cities), - ] { - let base = row; - 
s.with_iterator(|iter| { - for (i, b) in iter.enumerate() { - assert_eq!(b, Some(want[base + i].as_bytes()), "row {}", base + i); - } - Ok::<_, vortex_error::VortexError>(()) - }) - .unwrap(); - } - row += chunk_len; - } - assert_eq!(row, n); -} - -/// 30 short fixed strings where the dictionary blob length is unlikely to -/// be a multiple of 4. Earlier buffer orderings (dict_bytes first) tripped -/// the segment writer's first-buffer-only alignment, surfacing -/// `Misaligned buffer cannot be used to build PrimitiveArray of u32` on -/// read. -#[tokio::test] -async fn odd_dict_length_alignment() { - let words: &[&str] = &[ - "a", "bb", "ccc", "dddd", "eeeee", "fffff", "ggggggg", "h", "ii", "jjj", - ]; - let n = 20_000usize; - let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); - let str_array = VarBinViewArray::from_iter( - strings.iter().map(|s| Some(*s)), - DType::Utf8(Nullability::NonNullable), - ) - .into_array(); - let data = StructArray::new( - FieldNames::from(["w"]), - vec![str_array], - n, - Validity::NonNullable, - ) - .into_array(); - - let chunks = write_and_read_back(data).await; - let mut row = 0; - for chunk in chunks { - let strct = chunk - .try_downcast::() - .expect("Struct"); - let mut ctx = SESSION.create_execution_ctx(); - let s = strct - .unmasked_field(0) - .clone() - .execute::(&mut ctx) - .unwrap(); - s.with_iterator(|iter| { - for b in iter { - assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); - row += 1; - } - Ok::<_, vortex_error::VortexError>(()) - }) - .unwrap(); - } - assert_eq!(row, n); -} - -/// Mixed-shape strings: empty, short, very long, with a fair chunk of nulls -/// — exercising the validity child + edge offsets. 
-#[tokio::test] -async fn nullable_and_extreme_shapes() { - let n = 16_000usize; - let mut strings: Vec> = Vec::with_capacity(n); - for i in 0..n { - match i % 11 { - 0 => strings.push(None), - 1 => strings.push(Some(String::new())), - 2 => strings.push(Some("a".repeat(1024))), - 3 => strings.push(Some(format!("row-{i}"))), - _ => strings.push(Some(corpus(1, i as u64).pop().unwrap())), - } - } - let str_array = VarBinViewArray::from_iter( - strings.iter().map(|s| s.as_deref()), - DType::Utf8(Nullability::Nullable), - ) - .into_array(); - let data = StructArray::new( - FieldNames::from(["s"]), - vec![str_array], - n, - Validity::NonNullable, - ) - .into_array(); - - let chunks = write_and_read_back(data).await; - let mut row = 0; - for chunk in chunks { - let strct = chunk - .try_downcast::() - .expect("Struct"); - let mut ctx = SESSION.create_execution_ctx(); - let s = strct - .unmasked_field(0) - .clone() - .execute::(&mut ctx) - .unwrap(); - s.with_iterator(|iter| { - for b in iter { - let want = strings[row].as_deref().map(str::as_bytes); - assert_eq!(b, want, "row {row}"); - row += 1; - } - Ok::<_, vortex_error::VortexError>(()) - }) - .unwrap(); - } - assert_eq!(row, n); -} From 75f735cb5236b837116440337c8c93fdd8eb1a9d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 11:00:47 +0100 Subject: [PATCH 06/27] fix Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/decode.rs | 2 +- vortex-btrblocks/src/schemes/string.rs | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index a7697977a70..c75eff10d22 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors // -//! Pure-Rust decoder for an [`OnPair`][crate::OnPair] array. +//! 
Pure-Rust decoder for an [`OnPair`] array. //! //! The decode loop is intentionally simple — one `u16` code load, one //! `u64` table load, one fixed 16-byte over-copy `memcpy` — so the diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 2b73d3daab1..ec994797ff2 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -34,9 +34,10 @@ use crate::SchemeExt; /// FSST (Fast Static Symbol Table) compression. /// -/// One of the two string-fragmentation schemes in the default [`ALL_SCHEMES`] -/// (alongside [`OnPairScheme`]); the sample-based selector keeps whichever is -/// smaller per column. FSST compresses faster, OnPair usually wins on ratio. +/// One of the two string-fragmentation schemes in the default +/// [`crate::ALL_SCHEMES`] (alongside `OnPairScheme`); the sample-based selector +/// keeps whichever is smaller per column. FSST compresses faster, OnPair +/// usually wins on ratio. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; From a21ea30a5b156ed9a7f5cce0ed663644654448db Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 11:35:31 +0100 Subject: [PATCH 07/27] onpair: delegate decompression to the upstream onpair crate Replace the hand-rolled SIMD decoder (DecodeView::decode_rows_unchecked, build_dict_table) with onpair::decompress_into / decompress_row_into / decompressed_len. OwnedDecodeInputs is now just four flat host buffers plus a Parts<'_, u32> view; the hot loop lives upstream where the aarch64 NEON intrinsic path also lives. Bench (UrlLog, 1M rows): decompress_into median 8.4 ms, canonicalize_to_varbinview 14.7 ms. Adds num-traits as a direct dep to support the generic widen helpers (AsPrimitive::as_() side-steps clippy::cast_* lints on the match_each_integer_ptype! arms). 
Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + encodings/experimental/onpair/Cargo.toml | 1 + .../experimental/onpair/benches/decode.rs | 38 +- .../experimental/onpair/src/canonical.rs | 41 +- encodings/experimental/onpair/src/decode.rs | 362 +++--------------- encodings/experimental/onpair/src/ops.rs | 14 +- 6 files changed, 103 insertions(+), 354 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dd93bc07661..421572289e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9907,6 +9907,7 @@ version = "0.1.0" dependencies = [ "codspeed-divan-compat", "memchr", + "num-traits", "onpair", "prost 0.14.3", "rstest", diff --git a/encodings/experimental/onpair/Cargo.toml b/encodings/experimental/onpair/Cargo.toml index ba8c478570b..258568cc75a 100644 --- a/encodings/experimental/onpair/Cargo.toml +++ b/encodings/experimental/onpair/Cargo.toml @@ -18,6 +18,7 @@ workspace = true [dependencies] memchr = { workspace = true } +num-traits = { workspace = true } onpair = { workspace = true } prost = { workspace = true } vortex-array = { workspace = true } diff --git a/encodings/experimental/onpair/benches/decode.rs b/encodings/experimental/onpair/benches/decode.rs index 52a4ea77d87..2b77aae21ac 100644 --- a/encodings/experimental/onpair/benches/decode.rs +++ b/encodings/experimental/onpair/benches/decode.rs @@ -3,10 +3,9 @@ // //! Decode-path microbenchmarks for the OnPair Vortex array. //! -//! * `decode_rows_unchecked` — the production decoder hot loop (combined -//! `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled). -//! Measured by hand-driving `DecodeView::decode_rows_unchecked` straight -//! into a `Vec` so the time reflects the inner loop only. +//! * `decompress_into` — the upstream `onpair::decompress_into` decoder hot +//! loop, fed by a pre-materialised [`OwnedDecodeInputs`]. Measures the +//! inner loop only (no `collect`, no allocation). //! * `canonicalize_to_varbinview` — the full Vortex //! `OnPair → VarBinViewArray` path callers actually hit. 
Includes //! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc. @@ -37,8 +36,10 @@ use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::session::ArraySession; use vortex_mask::Mask; +use onpair::DECOMPRESS_BUFFER_PADDING; +use onpair::decompress_into; +use onpair::decompressed_len; use vortex_onpair::DEFAULT_DICT12_CONFIG; -use vortex_onpair::MAX_TOKEN_SIZE; use vortex_onpair::OnPair; use vortex_onpair::OnPairArray; use vortex_onpair::decode::OwnedDecodeInputs; @@ -126,18 +127,12 @@ fn compress(n: usize, shape: Shape) -> OnPairArray { .unwrap_or_else(|e| panic!("onpair_compress failed: {e}")) } -fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) { +fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize) { let mut ctx = SESSION.create_execution_ctx(); let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) .unwrap_or_else(|e| panic!("collect: {e}")); - let n = arr.len(); - let total: usize = inputs - .codes - .as_slice() - .iter() - .map(|&c| (inputs.dict_table.as_slice()[c as usize] & 0xffff) as usize) - .sum(); - (inputs, n, total) + let total = decompressed_len(inputs.as_parts()); + (inputs, total) } const CASES: &[(Shape, usize)] = &[ @@ -149,19 +144,16 @@ const CASES: &[(Shape, usize)] = &[ ]; /// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the -/// output allocation. Hits `DecodeView::decode_rows_unchecked` directly. +/// output allocation. Hits `onpair::decompress_into` directly. 
#[divan::bench(args = CASES)] -fn decode_rows_unchecked(bencher: Bencher, case: (Shape, usize)) { +fn decompress_into_bench(bencher: Bencher, case: (Shape, usize)) { let (shape, n) = case; let arr = compress(n, shape); - let (inputs, n_rows, total) = materialise(&arr); + let (inputs, total) = materialise(&arr); bencher.bench_local(|| { - let mut out: Vec = Vec::with_capacity(total + MAX_TOKEN_SIZE); - let dv = inputs.view(); - unsafe { - let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr()); - out.set_len(written); - } + let mut out: Vec = Vec::with_capacity(total + DECOMPRESS_BUFFER_PADDING); + let written = decompress_into(inputs.as_parts(), out.spare_capacity_mut()); + unsafe { out.set_len(written) }; divan::black_box(out); }); } diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs index 9837d63a35f..23b22251d37 100644 --- a/encodings/experimental/onpair/src/canonical.rs +++ b/encodings/experimental/onpair/src/canonical.rs @@ -1,11 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors // -//! Convert an [`OnPairArray`] to its canonical `VarBinViewArray` by running -//! the pure-Rust dictionary-lookup decoder over every row. +//! Convert an [`OnPairArray`] to its canonical `VarBinViewArray` by handing +//! the materialised parts to `onpair::decompress_into`. use std::sync::Arc; +use onpair::DECOMPRESS_BUFFER_PADDING; +use onpair::decompress_into; +use onpair::decompressed_len; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -42,37 +45,21 @@ pub(crate) fn onpair_decode_views( start_buf_index: u32, ctx: &mut ExecutionCtx, ) -> VortexResult<(Vec, Buffer)> { - let n = array.array().len(); let lengths = array .uncompressed_lengths() .clone() .execute::(ctx)?; - #[expect(clippy::cast_possible_truncation)] - let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { - lengths.as_slice::

().iter().map(|x| *x as usize).sum() - }); - let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let dv = inputs.view(); - // Decode directly into the canonical output buffer's spare capacity — - // no temporary `Vec` + `extend_from_slice` round-trip. Total size - // is already known from `uncompressed_lengths`, so we can size the - // buffer once with the over-copy slack and call into the unchecked - // single-pass decoder. - let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE); - // SAFETY: - // * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes - // above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE - // bytes past the true end, all within reserved capacity. - // * Caller has verified the array's invariants in `OnPair::try_new`, - // so every code is a valid index and `dict_bytes` is padded. - unsafe { - let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::(); - let written = dv.decode_rows_unchecked(0, n, dst); - debug_assert_eq!(written, total_size); - out_bytes.set_len(written); - } + let parts = inputs.as_parts(); + let total_size = decompressed_len(parts); + + let mut out_bytes = ByteBufferMut::with_capacity(total_size + DECOMPRESS_BUFFER_PADDING); + let written = decompress_into(parts, out_bytes.spare_capacity_mut()); + debug_assert_eq!(written, total_size); + // SAFETY: `decompress_into` initialised exactly `written` bytes of the + // spare capacity reserved above. + unsafe { out_bytes.set_len(written) }; match_each_integer_ptype!(lengths.ptype(), |P| { Ok(build_views( diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index c75eff10d22..27eb1321e7a 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -1,20 +1,19 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors // -//! Pure-Rust decoder for an [`OnPair`] array. -//! -//! 
The decode loop is intentionally simple — one `u16` code load, one -//! `u64` table load, one fixed 16-byte over-copy `memcpy` — so the -//! autovectoriser keeps the hot path SIMD-friendly. We materialise the -//! children once into native-aligned `Buffer`s (and pack the dict -//! offsets + lengths into a single `Buffer` lookup table) so the -//! inner loop indexes straight into raw slices with no branches. - +//! Bridge between [`OnPair`] slot children and the upstream `onpair` crate's +//! decompression API. We materialise the dictionary blob and the three +//! integer children into native-aligned host buffers once, then hand the +//! result to [`onpair::decompress_into`] / [`onpair::decompress_row_into`]. +//! The hot decode loop lives in the `onpair` crate. + +use num_traits::AsPrimitive; +use onpair::Parts; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; -use vortex_array::dtype::PType; +use vortex_array::dtype::NativePType; use vortex_array::match_each_integer_ptype; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; @@ -26,321 +25,82 @@ use crate::OnPairArraySlotsExt; /// Materialised, host-resident copies of every read path's input. /// -/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot -/// on the outer `OnPair` array, possibly wrapped in a non-canonical -/// encoding the cascading compressor chose (e.g. FastLanes-bit-packed -/// `codes`, `narrow`-ed dict offsets). `execute::` may -/// hand us back a narrower ptype than the decode loop wants. `collect` -/// widens each child to the decoder's native width (`u32` for both offset -/// arrays, `u16` for codes) once so the inner loop is branch-free pointer -/// arithmetic. -/// -/// Construction also packs `dict_offsets` into the combined -/// `(offset << 16) | length` `dict_table` so the decode hot loop loads a -/// single `u64` per token instead of two adjacent `u32`s. 
+/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot on +/// the outer `OnPair` array, possibly wrapped in a non-canonical encoding the +/// cascading compressor chose (e.g. FastLanes-bit-packed `codes`, narrowed +/// dict offsets). `collect` runs `execute::` once per child +/// and widens each to the decoder's native width (`u32` for both offset +/// arrays, `u16` for codes) so [`Self::as_parts`] can hand a borrowed +/// [`Parts`] view to the upstream decoder. pub struct OwnedDecodeInputs { pub dict_bytes: ByteBuffer, - /// `(dict_offset << 16) | dict_len` per token. `dict_len` ≤ - /// `MAX_TOKEN_SIZE = 16` so 16 bits suffice. - pub dict_table: Buffer, + pub dict_offsets: Buffer, pub codes: Buffer, - pub codes_offsets: Buffer, + pub code_boundaries: Buffer, + pub bits: u32, } impl OwnedDecodeInputs { pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { - let dict_offsets_arr = to_primitive(array.dict_offsets(), ctx)?; - let dict_table = build_dict_table(&dict_offsets_arr); Ok(Self { dict_bytes: array.dict_bytes().clone(), - dict_table, - codes: widen_to_u16(&to_primitive(array.codes(), ctx)?), - codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?), + dict_offsets: widen_to::(&to_primitive(array.dict_offsets(), ctx)?), + codes: widen_to::(&to_primitive(array.codes(), ctx)?), + code_boundaries: widen_to::(&to_primitive(array.codes_offsets(), ctx)?), + bits: array.bits(), }) } - pub fn view(&self) -> DecodeView<'_> { - DecodeView { + /// Borrowed [`Parts`] view consumed by `onpair::decompress*`. 
+ pub fn as_parts(&self) -> Parts<'_, u32> { + Parts { dict_bytes: self.dict_bytes.as_slice(), - dict_table: self.dict_table.as_slice(), + dict_offsets: self.dict_offsets.as_slice(), + bits: self.bits, codes: self.codes.as_slice(), - codes_offsets: self.codes_offsets.as_slice(), + code_boundaries: self.code_boundaries.as_slice(), } } } -/// Pack `dict_offsets` directly into `(offset << 16) | length` per token. -/// Reads through the integer-ptype macro once so we don't have to widen -/// the offsets buffer first — saves one `Vec` allocation in the common -/// (non-narrowed) case. -#[allow( - clippy::cast_lossless, - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - clippy::unnecessary_cast -)] -fn build_dict_table(arr: &PrimitiveArray) -> Buffer { - match_each_integer_ptype!(arr.ptype(), |P| { - let slice = arr.as_slice::

(); - if slice.is_empty() { - return Buffer::::copy_from(Vec::::new()); - } - let dict_size = slice.len() - 1; - let mut table = BufferMut::::with_capacity(dict_size); - for i in 0..dict_size { - let off = slice[i] as u64; - let len = (slice[i + 1] - slice[i]) as u64; - // SAFETY: capacity reserved above; we push exactly dict_size times. - unsafe { table.push_unchecked((off << 16) | len) }; - } - table.freeze() - }) -} - fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { arr.clone().execute::(ctx) } -/// Widen any integer-typed `PrimitiveArray` to `Buffer`. When the -/// underlying ptype already matches we transmute the buffer instead of -/// allocating a new one. Used when the cascading compressor narrowed an -/// offset array (e.g. `u32` → `u16`). -#[allow( - clippy::cast_lossless, - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - clippy::unnecessary_cast -)] -fn widen_to_u32(arr: &PrimitiveArray) -> Buffer { - if arr.ptype() == PType::U32 { - // Cheap: PrimitiveArray's underlying buffer is Arc-shared, so - // `into_buffer` on a clone is effectively a refcount bump. - return arr.clone().into_buffer::(); +/// Widen any integer-typed `PrimitiveArray` to `Buffer`. If the underlying +/// ptype already matches `T` we share the existing buffer (an Arc refcount +/// bump, no copy); otherwise we dispatch on ptype and run an element-wise +/// `AsPrimitive::as_()` cast via [`widen`]. +fn widen_to(arr: &PrimitiveArray) -> Buffer +where + T: NativePType, + u8: AsPrimitive, + i8: AsPrimitive, + u16: AsPrimitive, + i16: AsPrimitive, + u32: AsPrimitive, + i32: AsPrimitive, + u64: AsPrimitive, + i64: AsPrimitive, +{ + if arr.ptype() == T::PTYPE { + return arr.clone().into_buffer::(); } - match_each_integer_ptype!(arr.ptype(), |P| { - let slice = arr.as_slice::

(); - let mut out = BufferMut::::with_capacity(slice.len()); - for &v in slice { - // SAFETY: capacity reserved above. - unsafe { out.push_unchecked(v as u32) }; - } - out.freeze() - }) + match_each_integer_ptype!(arr.ptype(), |P| { widen::(arr.as_slice::

()) }) } -/// As `widen_to_u32` but for `Buffer`. -#[allow( - clippy::cast_lossless, - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - clippy::unnecessary_cast -)] -fn widen_to_u16(arr: &PrimitiveArray) -> Buffer { - if arr.ptype() == PType::U16 { - return arr.clone().into_buffer::(); - } - match_each_integer_ptype!(arr.ptype(), |P| { - let slice = arr.as_slice::

(); - let mut out = BufferMut::::with_capacity(slice.len()); - for &v in slice { - // SAFETY: capacity reserved above. - unsafe { out.push_unchecked(v as u16) }; - } - out.freeze() - }) -} - -/// Borrowed slices for the decode loop. -#[derive(Copy, Clone)] -pub struct DecodeView<'a> { - pub dict_bytes: &'a [u8], - pub dict_table: &'a [u64], - pub codes: &'a [u16], - pub codes_offsets: &'a [u32], -} - -impl<'a> DecodeView<'a> { - /// Decode row `row` into `out` (appended). Thin wrapper around - /// [`Self::decode_rows_into`]. - #[inline] - pub fn decode_row_into(&self, row: usize, out: &mut Vec) { - self.decode_rows_into(row, 1, out); - } - - /// Bulk decode rows `[start, start + count)` contiguously into `out`. - /// Pre-computes the decoded length, reserves once, then delegates to - /// the unrolled fast path. Callers that already know the size (e.g. - /// canonicalize from `uncompressed_lengths`) should call - /// [`Self::decode_rows_into_with_size`] to skip the size pre-pass. - pub fn decode_rows_into(&self, start: usize, count: usize, out: &mut Vec) { - if count == 0 { - return; - } - let decoded_len = self.decoded_len_rows(start, count); - let written_start = out.len(); - out.reserve(decoded_len + crate::MAX_TOKEN_SIZE); - // SAFETY: capacity reserved above; `decode_rows_unchecked`'s - // invariants are upheld by the [`OnPair::try_new`] validation. - unsafe { - let written = - self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)); - debug_assert_eq!(written, decoded_len); - out.set_len(written_start + written); - } - } - - /// Single-pass over-copy decode of a token window into raw `dst`. - /// - /// Each iteration loads one `u16` code, one `u64` dict-table - /// entry, issues a fixed [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] - /// `copy_nonoverlapping` (which LLVM lowers to a single unaligned - /// 128-bit SIMD store on x86_64 / aarch64), and advances the cursor by - /// the *true* token length. 
The body is hand-unrolled four times so - /// the CPU can keep four independent stores in flight, matching the - /// `ONPAIR_EMIT4` block of the upstream `decode_all.h`. - /// - /// Returns the number of *true* bytes written. - /// - /// # Safety - /// * `dst` must point into a region with at least - /// `decoded_byte_length + MAX_TOKEN_SIZE` bytes of writable - /// uninitialised capacity. - /// * `self.dict_bytes` must have at least `MAX_TOKEN_SIZE` trailing - /// pad bytes past the last real token byte (`compress.rs` enforces - /// this). - /// * Every `code` in the window must be `< self.dict_table.len()`. - #[inline] - pub unsafe fn decode_rows_unchecked(&self, start: usize, count: usize, dst: *mut u8) -> usize { - if count == 0 { - return 0; - } - // SAFETY: caller invariants. - let lo = unsafe { *self.codes_offsets.get_unchecked(start) } as usize; - let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize; - - let codes_ptr = self.codes.as_ptr(); - let table_ptr = self.dict_table.as_ptr(); - let dict_ptr = self.dict_bytes.as_ptr(); - - let mut cursor = dst; - let unroll_end = lo + ((hi - lo) & !3); - let mut i = lo; - // SAFETY: indices derived from validated offsets; the 16-byte - // over-copy reads stay within `dict_bytes`'s trailing pad; writes - // stay within the caller-promised capacity. - unsafe { - while i < unroll_end { - macro_rules! 
emit { - ($k:expr) => {{ - let c = *codes_ptr.add(i + $k) as usize; - let entry = *table_ptr.add(c); - let off = (entry >> 16) as usize; - let len = (entry & 0xffff) as usize; - std::ptr::copy_nonoverlapping( - dict_ptr.add(off), - cursor, - crate::MAX_TOKEN_SIZE, - ); - cursor = cursor.add(len); - }}; - } - emit!(0); - emit!(1); - emit!(2); - emit!(3); - i += 4; - } - while i < hi { - let c = *codes_ptr.add(i) as usize; - let entry = *table_ptr.add(c); - let off = (entry >> 16) as usize; - let len = (entry & 0xffff) as usize; - std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE); - cursor = cursor.add(len); - i += 1; - } - cursor.offset_from(dst) as usize - } - } - - /// Single-pass decode when the caller already knows the total decoded - /// byte length (e.g. from summing `uncompressed_lengths`). Skips the - /// size-precomputation pass. - /// - /// # Safety - /// `out.capacity() - out.len() >= total_size + MAX_TOKEN_SIZE` and - /// `total_size` equals the true decoded length. - #[inline] - pub unsafe fn decode_rows_into_with_size( - &self, - start: usize, - count: usize, - total_size: usize, - out: &mut Vec, - ) { - let written_start = out.len(); - debug_assert!(out.capacity() - written_start >= total_size + crate::MAX_TOKEN_SIZE); - // SAFETY: caller's invariants. - let written = unsafe { - self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)) - }; - debug_assert_eq!(written, total_size); - // SAFETY: `written` ≤ reserved capacity (caller invariants). - unsafe { out.set_len(written_start + written) }; - } - - /// Decoded byte length of row `row` without copying any bytes. - #[inline] - pub fn decoded_len(&self, row: usize) -> usize { - self.decoded_len_rows(row, 1) - } - - /// Decoded byte length of rows `[start, start + count)`. Uses the - /// combined `dict_table` — one `u64` load per token. 
- #[inline] - pub fn decoded_len_rows(&self, start: usize, count: usize) -> usize { - if count == 0 { - return 0; - } - let lo = self.codes_offsets[start] as usize; - let hi = self.codes_offsets[start + count] as usize; - let mut total = 0usize; - // SAFETY: bounds checked by indexing above. - unsafe { - for i in lo..hi { - let c = *self.codes.get_unchecked(i) as usize; - total += (*self.dict_table.get_unchecked(c) & 0xffff) as usize; - } - } - total - } - - /// Iterate the decoded bytes of `row` without materialising the full - /// row, calling `f` on each contiguous dict slice. Returns - /// - /// * `true` if every slice was visited (i.e. `f` always returned - /// `true`), - /// * `false` if `f` short-circuited with `false`. - /// - /// Useful for predicates that can short-circuit, e.g. `equals` and - /// `starts_with`. - #[inline] - pub fn for_each_dict_slice bool>(&self, row: usize, mut f: F) -> bool { - let lo = self.codes_offsets[row] as usize; - let hi = self.codes_offsets[row + 1] as usize; - let codes = &self.codes[lo..hi]; - // SAFETY: codes were validated at construction time. - unsafe { - for &c in codes { - let entry = *self.dict_table.get_unchecked(c as usize); - let off = (entry >> 16) as usize; - let len = (entry & 0xffff) as usize; - let slice = self.dict_bytes.get_unchecked(off..off + len); - if !f(slice) { - return false; - } - } - } - true +/// Element-wise widen from `&[P]` to `Buffer` via [`AsPrimitive`]. +/// Method-call casts side-step the `clippy::cast_*` lints that `as` triggers +/// on each ptype arm of `match_each_integer_ptype!`. +fn widen(slice: &[P]) -> Buffer +where + P: NativePType + AsPrimitive, + T: NativePType, +{ + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. 
+ unsafe { out.push_unchecked(v.as_()) }; } + out.freeze() } diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs index 55e6c77b1e0..5d5b17f4eec 100644 --- a/encodings/experimental/onpair/src/ops.rs +++ b/encodings/experimental/onpair/src/ops.rs @@ -1,6 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use onpair::DECOMPRESS_BUFFER_PADDING; +use onpair::decompress_row_into; +use onpair::decompressed_row_len; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::varbin::varbin_scalar; @@ -19,9 +22,14 @@ impl OperationsVTable for OnPair { ctx: &mut ExecutionCtx, ) -> VortexResult { let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let dv = inputs.view(); - let mut buf: Vec = Vec::with_capacity(dv.decoded_len(index)); - dv.decode_row_into(index, &mut buf); + let parts = inputs.as_parts(); + let len = decompressed_row_len(parts, index); + let mut buf: Vec = Vec::with_capacity(len + DECOMPRESS_BUFFER_PADDING); + let written = decompress_row_into(parts, index, buf.spare_capacity_mut()); + debug_assert_eq!(written, len); + // SAFETY: `decompress_row_into` initialised `written` bytes of the + // spare capacity reserved above. 
+ unsafe { buf.set_len(written) }; Ok(varbin_scalar(ByteBuffer::from(buf), array.dtype())) } } From 3c6006a08fd5e9c4e224fb429358a036dc8771f2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 11:50:04 +0100 Subject: [PATCH 08/27] fix Signed-off-by: Joe Isaacs --- .../experimental/onpair/benches/decode.rs | 8 +- .../experimental/onpair/src/canonical.rs | 7 +- encodings/experimental/onpair/src/decode.rs | 108 ++++++++---------- encodings/experimental/onpair/src/ops.rs | 7 +- 4 files changed, 53 insertions(+), 77 deletions(-) diff --git a/encodings/experimental/onpair/benches/decode.rs b/encodings/experimental/onpair/benches/decode.rs index 2b77aae21ac..6be4d0e82ad 100644 --- a/encodings/experimental/onpair/benches/decode.rs +++ b/encodings/experimental/onpair/benches/decode.rs @@ -27,6 +27,7 @@ use std::sync::LazyLock; use divan::Bencher; +use onpair::DECOMPRESS_BUFFER_PADDING; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::VarBinArray; @@ -36,9 +37,6 @@ use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::session::ArraySession; use vortex_mask::Mask; -use onpair::DECOMPRESS_BUFFER_PADDING; -use onpair::decompress_into; -use onpair::decompressed_len; use vortex_onpair::DEFAULT_DICT12_CONFIG; use vortex_onpair::OnPair; use vortex_onpair::OnPairArray; @@ -131,7 +129,7 @@ fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize) { let mut ctx = SESSION.create_execution_ctx(); let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) .unwrap_or_else(|e| panic!("collect: {e}")); - let total = decompressed_len(inputs.as_parts()); + let total = inputs.decompressed_len(); (inputs, total) } @@ -152,7 +150,7 @@ fn decompress_into_bench(bencher: Bencher, case: (Shape, usize)) { let (inputs, total) = materialise(&arr); bencher.bench_local(|| { let mut out: Vec = Vec::with_capacity(total + DECOMPRESS_BUFFER_PADDING); - let written = decompress_into(inputs.as_parts(), 
out.spare_capacity_mut()); + let written = inputs.decompress_into(out.spare_capacity_mut()); unsafe { out.set_len(written) }; divan::black_box(out); }); diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs index 23b22251d37..1801619aedf 100644 --- a/encodings/experimental/onpair/src/canonical.rs +++ b/encodings/experimental/onpair/src/canonical.rs @@ -7,8 +7,6 @@ use std::sync::Arc; use onpair::DECOMPRESS_BUFFER_PADDING; -use onpair::decompress_into; -use onpair::decompressed_len; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -51,11 +49,10 @@ pub(crate) fn onpair_decode_views( .execute::(ctx)?; let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let parts = inputs.as_parts(); - let total_size = decompressed_len(parts); + let total_size = inputs.decompressed_len(); let mut out_bytes = ByteBufferMut::with_capacity(total_size + DECOMPRESS_BUFFER_PADDING); - let written = decompress_into(parts, out_bytes.spare_capacity_mut()); + let written = inputs.decompress_into(out_bytes.spare_capacity_mut()); debug_assert_eq!(written, total_size); // SAFETY: `decompress_into` initialised exactly `written` bytes of the // spare capacity reserved above. diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index 27eb1321e7a..3ca304c8dec 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -2,21 +2,20 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors // //! Bridge between [`OnPair`] slot children and the upstream `onpair` crate's -//! decompression API. We materialise the dictionary blob and the three -//! integer children into native-aligned host buffers once, then hand the -//! result to [`onpair::decompress_into`] / [`onpair::decompress_row_into`]. -//! The hot decode loop lives in the `onpair` crate. +//! decompression API. 
+ +use std::mem::MaybeUninit; -use num_traits::AsPrimitive; use onpair::Parts; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; use vortex_array::dtype::NativePType; -use vortex_array::match_each_integer_ptype; +use vortex_array::dtype::Nullability; use vortex_buffer::Buffer; -use vortex_buffer::BufferMut; use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; @@ -24,14 +23,6 @@ use crate::OnPair; use crate::OnPairArraySlotsExt; /// Materialised, host-resident copies of every read path's input. -/// -/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot on -/// the outer `OnPair` array, possibly wrapped in a non-canonical encoding the -/// cascading compressor chose (e.g. FastLanes-bit-packed `codes`, narrowed -/// dict offsets). `collect` runs `execute::` once per child -/// and widens each to the decoder's native width (`u32` for both offset -/// arrays, `u16` for codes) so [`Self::as_parts`] can hand a borrowed -/// [`Parts`] view to the upstream decoder. 
pub struct OwnedDecodeInputs { pub dict_bytes: ByteBuffer, pub dict_offsets: Buffer, @@ -42,17 +33,52 @@ pub struct OwnedDecodeInputs { impl OwnedDecodeInputs { pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { + /// Cast `arr` to `Primitive(T::PTYPE, _)` then execute to Primitive + fn cast_and_collect( + arr: &ArrayRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); + let prim = arr.cast(dtype)?.execute::(ctx)?; + Ok(prim.into_buffer::()) + } + Ok(Self { dict_bytes: array.dict_bytes().clone(), - dict_offsets: widen_to::(&to_primitive(array.dict_offsets(), ctx)?), - codes: widen_to::(&to_primitive(array.codes(), ctx)?), - code_boundaries: widen_to::(&to_primitive(array.codes_offsets(), ctx)?), + dict_offsets: cast_and_collect::(array.dict_offsets(), ctx)?, + codes: cast_and_collect::(array.codes(), ctx)?, + code_boundaries: cast_and_collect::(array.codes_offsets(), ctx)?, bits: array.bits(), }) } - /// Borrowed [`Parts`] view consumed by `onpair::decompress*`. - pub fn as_parts(&self) -> Parts<'_, u32> { + /// Total decoded byte length across all rows. + #[inline] + pub fn decompressed_len(&self) -> usize { + onpair::decompressed_len(self.as_parts()) + } + + /// Decoded byte length of a single row. + #[inline] + pub fn decompressed_row_len(&self, row: usize) -> usize { + onpair::decompressed_row_len(self.as_parts(), row) + } + + /// Decode every row contiguously into `out`. Returns the number of + /// initialised bytes. + #[inline] + pub fn decompress_into(&self, out: &mut [MaybeUninit]) -> usize { + onpair::decompress_into(self.as_parts(), out) + } + + /// Decode a single row into `out`. Returns the number of initialised + /// bytes. 
+ #[inline] + pub fn decompress_row_into(&self, row: usize, out: &mut [MaybeUninit]) -> usize { + onpair::decompress_row_into(self.as_parts(), row, out) + } + + fn as_parts(&self) -> Parts<'_, u32> { Parts { dict_bytes: self.dict_bytes.as_slice(), dict_offsets: self.dict_offsets.as_slice(), @@ -62,45 +88,3 @@ impl OwnedDecodeInputs { } } } - -fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - arr.clone().execute::(ctx) -} - -/// Widen any integer-typed `PrimitiveArray` to `Buffer`. If the underlying -/// ptype already matches `T` we share the existing buffer (an Arc refcount -/// bump, no copy); otherwise we dispatch on ptype and run an element-wise -/// `AsPrimitive::as_()` cast via [`widen`]. -fn widen_to(arr: &PrimitiveArray) -> Buffer -where - T: NativePType, - u8: AsPrimitive, - i8: AsPrimitive, - u16: AsPrimitive, - i16: AsPrimitive, - u32: AsPrimitive, - i32: AsPrimitive, - u64: AsPrimitive, - i64: AsPrimitive, -{ - if arr.ptype() == T::PTYPE { - return arr.clone().into_buffer::(); - } - match_each_integer_ptype!(arr.ptype(), |P| { widen::(arr.as_slice::

()) }) -} - -/// Element-wise widen from `&[P]` to `Buffer` via [`AsPrimitive`]. -/// Method-call casts side-step the `clippy::cast_*` lints that `as` triggers -/// on each ptype arm of `match_each_integer_ptype!`. -fn widen(slice: &[P]) -> Buffer -where - P: NativePType + AsPrimitive, - T: NativePType, -{ - let mut out = BufferMut::::with_capacity(slice.len()); - for &v in slice { - // SAFETY: capacity reserved above. - unsafe { out.push_unchecked(v.as_()) }; - } - out.freeze() -} diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs index 5d5b17f4eec..c947de84ec8 100644 --- a/encodings/experimental/onpair/src/ops.rs +++ b/encodings/experimental/onpair/src/ops.rs @@ -2,8 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use onpair::DECOMPRESS_BUFFER_PADDING; -use onpair::decompress_row_into; -use onpair::decompressed_row_len; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::varbin::varbin_scalar; @@ -22,10 +20,9 @@ impl OperationsVTable for OnPair { ctx: &mut ExecutionCtx, ) -> VortexResult { let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let parts = inputs.as_parts(); - let len = decompressed_row_len(parts, index); + let len = inputs.decompressed_row_len(index); let mut buf: Vec = Vec::with_capacity(len + DECOMPRESS_BUFFER_PADDING); - let written = decompress_row_into(parts, index, buf.spare_capacity_mut()); + let written = inputs.decompress_row_into(index, buf.spare_capacity_mut()); debug_assert_eq!(written, len); // SAFETY: `decompress_row_into` initialised `written` bytes of the // spare capacity reserved above. 
From 5b4b047e72ece66e27fc227776807ec4d50981b8 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 13:45:49 +0100 Subject: [PATCH 09/27] onpair: canonicalise offsets children before widening in decode The previous read path went through `arr.cast(u32).execute::<PrimitiveArray>`, which is unsafe when the cascading compressor wrapped the offsets in a `Delta` encoding: the Delta cast kernel preserves the Delta wrapping and widens the inner bases/deltas, but the fastlanes Delta bases-per-chunk layout is keyed on LANES (e.g. u8 = 64, u32 = 16). Widening misaligns the bases lookup in `Delta::undelta`, so the decoded values are not absolute monotonic offsets and `onpair::Parts` panics with "dictionary offsets must be nondecreasing" (seen on the Euro2016 compress-bench). Fix: call `execute::<PrimitiveArray>` first so the cascading encoding is decoded to absolute primitive values, then widen element-wise via `AsPrimitive::as_()`. Also add a roundtrip regression case in `vortex-btrblocks/tests/onpair_roundtrip.rs`. Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/decode.rs | 52 ++++++++++++++++----- vortex-btrblocks/tests/onpair_roundtrip.rs | 32 +++++++++++++ 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index 3ca304c8dec..ecf26c73ec7 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -6,16 +6,16 @@ use std::mem::MaybeUninit; +use num_traits::AsPrimitive; use onpair::Parts; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; -use vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::DType; use vortex_array::dtype::NativePType; -use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; use vortex_buffer::ByteBuffer; use 
vortex_error::VortexResult; @@ -33,21 +33,49 @@ pub struct OwnedDecodeInputs { impl OwnedDecodeInputs { pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { - /// Cast `arr` to `Primitive(T::PTYPE, _)` then execute to Primitive - fn cast_and_collect( + // Canonicalise each child to a PrimitiveArray first (decoding any + // cascading encoding the compressor chose — Delta, FastLanes bit-pack, + // narrowing — to absolute primitive values), then widen element-wise + // to the decoder's native width. Going through `cast(dtype).execute()` + // is unsafe here: the `Delta` cast kernel preserves the Delta wrapping + // and only widens the inner bases/deltas, but the fastlanes + // bases-per-chunk layout is keyed on LANES (e.g. u8 → 64, u32 → 16), + // so the widened Delta decodes against misaligned bases and produces + // non-monotonic offsets. + fn collect_widened( arr: &ArrayRef, ctx: &mut ExecutionCtx, - ) -> VortexResult> { - let dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); - let prim = arr.cast(dtype)?.execute::(ctx)?; - Ok(prim.into_buffer::()) + ) -> VortexResult> + where + u8: AsPrimitive, + i8: AsPrimitive, + u16: AsPrimitive, + i16: AsPrimitive, + u32: AsPrimitive, + i32: AsPrimitive, + u64: AsPrimitive, + i64: AsPrimitive, + { + let prim = arr.clone().execute::(ctx)?; + if prim.ptype() == T::PTYPE { + return Ok(prim.into_buffer::()); + } + Ok(match_each_integer_ptype!(prim.ptype(), |P| { + let slice = prim.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v.as_()) }; + } + out.freeze() + })) } Ok(Self { dict_bytes: array.dict_bytes().clone(), - dict_offsets: cast_and_collect::(array.dict_offsets(), ctx)?, - codes: cast_and_collect::(array.codes(), ctx)?, - code_boundaries: cast_and_collect::(array.codes_offsets(), ctx)?, + dict_offsets: collect_widened::(array.dict_offsets(), ctx)?, + codes: collect_widened::(array.codes(), ctx)?, + code_boundaries: collect_widened::(array.codes_offsets(), ctx)?, bits: array.bits(), }) } diff --git a/vortex-btrblocks/tests/onpair_roundtrip.rs b/vortex-btrblocks/tests/onpair_roundtrip.rs index 1cef6d471b8..5cae5f3df2b 100644 --- a/vortex-btrblocks/tests/onpair_roundtrip.rs +++ b/vortex-btrblocks/tests/onpair_roundtrip.rs @@ -125,6 +125,38 @@ fn nullable_roundtrip_via_default_compressor() { .unwrap(); } +/// Larger corpus that exercises the offsets-narrowing / delta-encoding paths +/// the cascading compressor enables past 2048 entries. The decoder must +/// reconstruct absolute u32 offsets from whatever encoded shape the +/// compressor chose for each child. +#[test] +fn large_unique_short_strings_roundtrip() { + let n = 1 << 13; // 8192 rows, all unique, short. 
+ let strings: Vec = (0..n).map(|i| format!("k{i:05x}")).collect(); + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + assert_eq!(got, Some(strings[i].as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + #[test] fn empty_and_short_string_roundtrip() { // Edge cases: empty strings interleaved with short ones. From 6bc07df34af994b86d186142584ffb3da4f59ad7 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 14:02:34 +0100 Subject: [PATCH 10/27] onpair: add regression test reproducing the delta dict_offsets panic The fix in 50cb91041 was missing a test that actually reproduces the Euro2016 compress-bench panic. `large_unique_short_strings_roundtrip` produces a `bitpacked(u16)` dict_offsets, whose cast widens correctly, so it passes against the buggy decode path too. `delta_dict_offsets_roundtrip` uses a 64k-row high-cardinality corpus that fills the dictionary toward the 4096-entry cap, forcing `dict_offsets` into a multi-chunk `Delta` (len > 1024). Against the old `arr.cast(u32).execute()` path this reconstructs non-monotonic offsets and panics in `onpair::decompress` with "dictionary offsets must be nondecreasing"; with the canonicalise-then-widen fix it round-trips. Verified: fails on the pre-fix decode.rs, passes after. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Joe Isaacs --- vortex-btrblocks/tests/onpair_roundtrip.rs | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/vortex-btrblocks/tests/onpair_roundtrip.rs b/vortex-btrblocks/tests/onpair_roundtrip.rs index 5cae5f3df2b..3843e02c319 100644 --- a/vortex-btrblocks/tests/onpair_roundtrip.rs +++ b/vortex-btrblocks/tests/onpair_roundtrip.rs @@ -183,3 +183,46 @@ fn empty_and_short_string_roundtrip() { }) .unwrap(); } + +/// Regression for the Euro2016 compress-bench panic +/// (`onpair::decompress`: "dictionary offsets must be nondecreasing"). +/// +/// A large, high-cardinality corpus fills the OnPair dictionary toward its +/// 4096-entry cap, so the cascading compressor narrows `dict_offsets` to `u16` +/// and Delta-encodes it across multiple FastLanes chunks (len > 1024). The old +/// decode path widened it via `arr.cast(u32).execute()`, but the `Delta` cast +/// kernel preserves the Delta wrapping and only widens the inner bases/deltas +/// in place — and the transposed bases layout is keyed on `T::LANES`, which +/// differs between `u16` and `u32`. Decoding the widened Delta against the +/// misaligned layout yields non-monotonic offsets and trips the upstream +/// assert. The fix canonicalises each child to a `PrimitiveArray` first, then +/// widens element-wise. +#[test] +fn delta_dict_offsets_roundtrip() { + let n = 1usize << 16; + // Hex-encoded index plus a hashed suffix: every row is unique with enough + // shared structure to route through OnPair while filling the dictionary. 
+ let strings: Vec = (0..n) + .map(|i| format!("{i:016x}-{:08x}", i.wrapping_mul(2654435761))) + .collect(); + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + assert_eq!(got, Some(strings[i].as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} From 3baf1b905193b80950b42d605e286ed2156632c9 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 14:14:59 +0100 Subject: [PATCH 11/27] fastlanes: don't widen Delta in place during cast (root cause of onpair panic) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Delta`'s `CastReduce` widened a Delta array by casting its stored `bases`/`deltas` in place and rewrapping. That is unsound: the components are held in FastLanes transposed layout with `T::LANES` (= 1024 / bit_width) entries per chunk, and `T::LANES` changes with the target width. The re-widened buffers keep the source width's layout, but `delta_decompress` reads them with the target width's lane count — so for any array beyond a single near-empty chunk the decoded values are wrong. This silently corrupted data for every caller widening a Delta array; the OnPair decode path surfaced it as a panic ("dictionary offsets must be nondecreasing") because the corrupted offsets were non-monotonic (Euro2016 compress-bench). The pre-existing conformance tests only used <10-element arrays, where the in-place widen happens to land correctly, so the bug went unnoticed. Fix: serve only same-width casts (e.g. 
nullability changes) in place; defer every width change to the decompress-then-cast fallback (`Ok(None)`). There is no cheap correct in-place widen — re-laying-out the transpose is equivalent to decompressing. Add a multi-chunk (len > 1024) widening conformance test that fails against the old kernel. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Joe Isaacs --- encodings/fastlanes/src/delta/compute/cast.rs | 60 +++++++++++++++++-- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/encodings/fastlanes/src/delta/compute/cast.rs b/encodings/fastlanes/src/delta/compute/cast.rs index 43a247df9f0..99c92a06603 100644 --- a/encodings/fastlanes/src/delta/compute/cast.rs +++ b/encodings/fastlanes/src/delta/compute/cast.rs @@ -20,18 +20,31 @@ impl CastReduce for Delta { }; let source_ptype = array.dtype().as_ptype(); - // TODO(DK): narrows can be safe but we must decompress to compute the maximum value. - if target_ptype.is_signed_int() || source_ptype.bit_width() > target_ptype.bit_width() { + // Only a same-width cast (e.g. a nullability change) can be served by + // re-casting the stored components in place. Any width change must defer + // to the decompress-then-cast fallback (`Ok(None)`): + // + // * Widening cannot be done in place. `bases`/`deltas` are held in + // FastLanes transposed layout with `T::LANES` (= 1024 / bit_width) + // entries per chunk, and `T::LANES` changes with the target width. + // Re-widening the buffers element-wise preserves the *source* width's + // layout, but `delta_decompress` then reads them with the *target* + // width's lane count, decoding against a misaligned layout and + // producing wrong (and, for `onpair` dictionary offsets, non-monotonic) + // values for any array larger than a single near-empty chunk. + // * Narrowing is unsafe without first decompressing to check the max + // value fits. 
+ if source_ptype.bit_width() != target_ptype.bit_width() { return Ok(None); } - // Signed sources need a different cast policy than the lossless widening cast - // used here. The delta bytes are stored as the result of `wrapping_sub`, so e.g. + // Signed sources need a different cast policy than the lossless cast used + // here. The delta bytes are stored as the result of `wrapping_sub`, so e.g. // a delta of -1i8 has the bit pattern 0xFF. Widening *as a value* (the cast op's // semantics) sign-extends that to 0xFFFFFFFF, which means `wrapping_add(base, delta)` // at the wider type produces a different result than at the source type — round-trip // breaks. Cross-signedness widening has the same hazard for the same reason. Fall // back to decompress-and-re-encode for both cases. - if source_ptype.is_signed_int() { + if target_ptype.is_signed_int() || source_ptype.is_signed_int() { return Ok(None); } @@ -88,6 +101,43 @@ mod tests { assert_arrays_eq!(casted, PrimitiveArray::from_iter([10u32, 20, 30, 40, 50])); } + /// Widening across more than one FastLanes chunk (len > 1024). The in-place + /// component cast is invalid here because `T::LANES` differs between source + /// and target widths, so this must fall back to decompress-then-cast. A + /// previous in-place widen produced non-monotonic values and corrupted + /// round-trips (the `onpair` dictionary-offsets panic). 
+ #[rstest] + #[case::u8_to_u32(8)] + #[case::u16_to_u32(16)] + fn test_cast_delta_widen_multichunk(#[case] src_width: u32) { + let n = 4096usize; + let expected: Vec = (0..n as u32).map(|i| (i * 3) % 60_000).collect(); + let delta = match src_width { + 8 => Delta::try_from_primitive_array( + &PrimitiveArray::from_iter((0..n).map(|i| ((i * 3) % 250) as u8)), + &mut SESSION.create_execution_ctx(), + ), + _ => Delta::try_from_primitive_array( + &PrimitiveArray::from_iter(expected.iter().map(|&v| v as u16)), + &mut SESSION.create_execution_ctx(), + ), + } + .unwrap(); + let expected: Vec = if src_width == 8 { + (0..n).map(|i| ((i * 3) % 250) as u32).collect() + } else { + expected + }; + + let casted = delta + .into_array() + .cast(DType::Primitive(PType::U32, Nullability::NonNullable)) + .unwrap() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap(); + assert_eq!(casted.as_slice::(), expected.as_slice()); + } + #[test] fn test_cast_delta_nullable() { // DeltaArray doesn't support nullable arrays - the validity is handled at the DeltaArray level From d31e4a72b88fe364cbf86dd38c79fd6bf22fea82 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 14:34:20 +0100 Subject: [PATCH 12/27] fix Signed-off-by: Joe Isaacs --- encodings/fastlanes/src/delta/vtable/rules.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/encodings/fastlanes/src/delta/vtable/rules.rs b/encodings/fastlanes/src/delta/vtable/rules.rs index d6892897ab5..e647e444bf3 100644 --- a/encodings/fastlanes/src/delta/vtable/rules.rs +++ b/encodings/fastlanes/src/delta/vtable/rules.rs @@ -9,5 +9,6 @@ use crate::delta::vtable::Delta; pub(crate) static RULES: ParentRuleSet = ParentRuleSet::new(&[ ParentRuleSet::lift(&SliceReduceAdaptor(Delta)), - ParentRuleSet::lift(&CastReduceAdaptor(Delta)), + // TODO(joe): fixme, this is incorrect.. 
+ // ParentRuleSet::lift(&CastReduceAdaptor(Delta)), ]); From 467d976321b601117f30dcbea33b24015ec2fe81 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 14:39:32 +0100 Subject: [PATCH 13/27] fastlanes: silence unused import after disabling Delta cast rule Commenting out the `CastReduceAdaptor(Delta)` registration left `CastReduceAdaptor` imported-but-unused, which fails CI clippy under `-D warnings`. Keep the import (the commented registration still references it, so the rule can be re-enabled once the in-place widen is fixed) and mark it `#[allow(unused_imports)]`. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Joe Isaacs --- encodings/fastlanes/src/delta/vtable/rules.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/encodings/fastlanes/src/delta/vtable/rules.rs b/encodings/fastlanes/src/delta/vtable/rules.rs index e647e444bf3..adf0cd3cc6a 100644 --- a/encodings/fastlanes/src/delta/vtable/rules.rs +++ b/encodings/fastlanes/src/delta/vtable/rules.rs @@ -3,6 +3,9 @@ use vortex_array::arrays::slice::SliceReduceAdaptor; use vortex_array::optimizer::rules::ParentRuleSet; +// Kept (with the registration below) so the Delta cast rule can be re-enabled +// once the in-place widening is made correct; see the TODO below. 
+#[allow(unused_imports)] use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; use crate::delta::vtable::Delta; From 5a13e8e5d00fbc19ecc513ee0d701ed01550e680 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 15:47:55 +0100 Subject: [PATCH 14/27] fix Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/builder.rs | 2 +- vortex-btrblocks/src/schemes/string.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 6c4c12b8a4b..61c40341dbc 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -191,7 +191,7 @@ impl BtrBlocksCompressorBuilder { string::StringDictScheme.id(), string::FSSTScheme.id(), binary::BinaryDictScheme.id(), - ]); + ]; #[cfg(feature = "unstable_encodings")] excluded.push(string::OnPairScheme.id()); let builder = self.exclude_schemes(excluded); diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 8304a1d21f1..8d83aaadf1d 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -318,7 +318,6 @@ mod onpair { use vortex_onpair::OnPairArraySlotsExt; use vortex_onpair::onpair_compress; - use super::is_utf8_string; use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; @@ -344,7 +343,7 @@ mod onpair { } fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) + canonical.dtype().is_utf8() } /// 4 primitive slot children flow through the cascading compressor: @@ -372,7 +371,7 @@ mod onpair { compress_ctx: CompressorContext, exec_ctx: &mut ExecutionCtx, ) -> VortexResult { - let utf8 = data.array_as_utf8().into_owned(); + let utf8 = data.array_as_varbinview().into_owned(); let onpair_array = onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; From 9be635befc171c9a88aa8b07f6f96b4842a35359 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 
16:35:25 +0100 Subject: [PATCH 15/27] fix Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/array.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs index 6183406cfae..188fff272de 100644 --- a/encodings/experimental/onpair/src/array.rs +++ b/encodings/experimental/onpair/src/array.rs @@ -126,6 +126,18 @@ pub struct OnPairSlots { /// Vortex slot child so it can be re-encoded by the cascading compressor. #[derive(Clone)] pub struct OnPairData { + /// The dictionary blob (buffer 0). + /// + /// INVARIANT: this buffer must be over-padded past its logical end + /// (`dict_offsets.last()`) by the decoder's fixed token read width, + /// [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE]. The over-copy decoder reads + /// every dictionary entry with one fixed-width load and then advances the + /// cursor by the token's true length, so the load for the final, shortest + /// token over-reads past the logical end of the dictionary. This is the + /// same over-read the decoder accounts for on the final few codes; the + /// trailing padding absorbs it so that any entry can be read in bounds. + /// `onpair_compress` establishes this padding (see `parts_to_children`); + /// the over-copy decoder lives in the `onpair` crate. 
dict_bytes: BufferHandle, bits: u32, len: usize, From 3b3a1698205cc94b2dbdaa69d6092e9830c461ac Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 17:12:28 +0100 Subject: [PATCH 16/27] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 4 +- Cargo.toml | 2 +- .../experimental/onpair/src/canonical.rs | 14 ++- encodings/experimental/onpair/src/decode.rs | 100 +++++++++++------- 4 files changed, 73 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 421572289e9..0382288dd9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5725,9 +5725,9 @@ checksum = "cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" [[package]] name = "onpair" -version = "0.0.2" +version = "0.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b630b38fb60f69bb1d6125b08cda2c93b350e6c90724af37c66dfb83e40c85e" +checksum = "c08c79a6daa8ac203293a6a231d8593b678c63ad3d7da6e25d1209ce750c9eb0" dependencies = [ "hashbrown 0.16.1", "rand 0.9.4", diff --git a/Cargo.toml b/Cargo.toml index a9253ec08dc..dd28de06b7c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -190,7 +190,7 @@ num_enum = { version = "0.7.3", default-features = false } object_store = { version = "0.13.1", default-features = false } once_cell = "1.21" oneshot = { version = "0.2.0", features = ["async"] } -onpair = { version = "0.0.2" } +onpair = { version = "0.0.3" } opentelemetry = "0.32.0" opentelemetry-otlp = "0.32.0" opentelemetry_sdk = "0.32.0" diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs index 1801619aedf..c1d573fe663 100644 --- a/encodings/experimental/onpair/src/canonical.rs +++ b/encodings/experimental/onpair/src/canonical.rs @@ -6,6 +6,7 @@ use std::sync::Arc; +use num_traits::AsPrimitive; use onpair::DECOMPRESS_BUFFER_PADDING; use vortex_array::ArrayRef; use vortex_array::ArrayView; @@ -24,7 +25,7 @@ use vortex_error::VortexResult; use crate::OnPair; use crate::OnPairArraySlotsExt; -use 
crate::decode::OwnedDecodeInputs; +use crate::decode::FullDecodeInputs; pub(super) fn canonicalize_onpair( array: ArrayView<'_, OnPair>, @@ -48,8 +49,15 @@ pub(crate) fn onpair_decode_views( .clone() .execute::(ctx)?; - let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let total_size = inputs.decompressed_len(); + let inputs = FullDecodeInputs::collect(array, ctx)?; + + let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { + lengths + .as_slice::

() + .iter() + .map(|&l| AsPrimitive::::as_(l)) + .sum() + }); let mut out_bytes = ByteBufferMut::with_capacity(total_size + DECOMPRESS_BUFFER_PADDING); let written = inputs.decompress_into(out_bytes.spare_capacity_mut()); diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index ecf26c73ec7..c76444d75b8 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -6,16 +6,15 @@ use std::mem::MaybeUninit; -use num_traits::AsPrimitive; use onpair::Parts; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; use vortex_array::dtype::NativePType; -use vortex_array::match_each_integer_ptype; use vortex_buffer::Buffer; -use vortex_buffer::BufferMut; use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; @@ -31,46 +30,20 @@ pub struct OwnedDecodeInputs { pub bits: u32, } +/// Canonicalise a slot child to the decoder's native primitive width. +fn collect_widened( + arr: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let dtype = DType::Primitive(T::PTYPE, arr.dtype().nullability()); + Ok(arr + .cast(dtype)? + .execute::(ctx)? + .into_buffer::()) +} + impl OwnedDecodeInputs { pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { - // Canonicalise each child to a PrimitiveArray first (decoding any - // cascading encoding the compressor chose — Delta, FastLanes bit-pack, - // narrowing — to absolute primitive values), then widen element-wise - // to the decoder's native width. Going through `cast(dtype).execute()` - // is unsafe here: the `Delta` cast kernel preserves the Delta wrapping - // and only widens the inner bases/deltas, but the fastlanes - // bases-per-chunk layout is keyed on LANES (e.g. 
u8 → 64, u32 → 16), - // so the widened Delta decodes against misaligned bases and produces - // non-monotonic offsets. - fn collect_widened( - arr: &ArrayRef, - ctx: &mut ExecutionCtx, - ) -> VortexResult> - where - u8: AsPrimitive, - i8: AsPrimitive, - u16: AsPrimitive, - i16: AsPrimitive, - u32: AsPrimitive, - i32: AsPrimitive, - u64: AsPrimitive, - i64: AsPrimitive, - { - let prim = arr.clone().execute::(ctx)?; - if prim.ptype() == T::PTYPE { - return Ok(prim.into_buffer::()); - } - Ok(match_each_integer_ptype!(prim.ptype(), |P| { - let slice = prim.as_slice::

(); - let mut out = BufferMut::::with_capacity(slice.len()); - for &v in slice { - // SAFETY: capacity reserved above. - unsafe { out.push_unchecked(v.as_()) }; - } - out.freeze() - })) - } - Ok(Self { dict_bytes: array.dict_bytes().clone(), dict_offsets: collect_widened::(array.dict_offsets(), ctx)?, @@ -116,3 +89,48 @@ impl OwnedDecodeInputs { } } } + +/// Inputs for whole-column decompression. +/// +/// Unlike [`OwnedDecodeInputs`], this deliberately omits the per-row +/// `code_boundaries` (`codes_offsets`) child: the contiguous +/// [`onpair::decompress_into`] decoder walks the flat `codes` stream directly +/// and never consults the per-row boundaries. Materialising that child for a +/// full canonicalisation is pure overhead — for a narrowed/bit-packed +/// `codes_offsets` it also forces an extra child `execute`. +pub struct FullDecodeInputs { + dict_bytes: ByteBuffer, + dict_offsets: Buffer, + codes: Buffer, + bits: u32, +} + +impl FullDecodeInputs { + pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { + Ok(Self { + dict_bytes: array.dict_bytes().clone(), + dict_offsets: collect_widened::(array.dict_offsets(), ctx)?, + codes: collect_widened::(array.codes(), ctx)?, + bits: array.bits(), + }) + } + + /// Decode every row contiguously into `out`. Returns the number of + /// initialised bytes. + #[inline] + pub fn decompress_into(&self, out: &mut [MaybeUninit]) -> usize { + onpair::decompress_into(self.as_parts(), out) + } + + fn as_parts(&self) -> Parts<'_, u32> { + Parts { + dict_bytes: self.dict_bytes.as_slice(), + dict_offsets: self.dict_offsets.as_slice(), + bits: self.bits, + codes: self.codes.as_slice(), + // `decompress_into` never reads the per-row boundaries; an empty + // slice keeps the `Parts` well-typed without materialising them. 
+ code_boundaries: &[], + } + } +} From 4de0ec15e9f671aef65df59955fc4ba3be7d5b9f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 17:41:03 +0100 Subject: [PATCH 17/27] fix Signed-off-by: Joe Isaacs --- .../experimental/onpair/benches/decode.rs | 3 +- .../experimental/onpair/src/canonical.rs | 24 +++++-- encodings/experimental/onpair/src/decode.rs | 66 ++++++------------- encodings/experimental/onpair/src/ops.rs | 3 +- 4 files changed, 42 insertions(+), 54 deletions(-) diff --git a/encodings/experimental/onpair/benches/decode.rs b/encodings/experimental/onpair/benches/decode.rs index 6be4d0e82ad..f22724ed9e2 100644 --- a/encodings/experimental/onpair/benches/decode.rs +++ b/encodings/experimental/onpair/benches/decode.rs @@ -27,7 +27,6 @@ use std::sync::LazyLock; use divan::Bencher; -use onpair::DECOMPRESS_BUFFER_PADDING; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::VarBinArray; @@ -149,7 +148,7 @@ fn decompress_into_bench(bencher: Bencher, case: (Shape, usize)) { let arr = compress(n, shape); let (inputs, total) = materialise(&arr); bencher.bench_local(|| { - let mut out: Vec = Vec::with_capacity(total + DECOMPRESS_BUFFER_PADDING); + let mut out: Vec = Vec::with_capacity(total); let written = inputs.decompress_into(out.spare_capacity_mut()); unsafe { out.set_len(written) }; divan::black_box(out); diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs index c1d573fe663..5700bd2cf8f 100644 --- a/encodings/experimental/onpair/src/canonical.rs +++ b/encodings/experimental/onpair/src/canonical.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use num_traits::AsPrimitive; -use onpair::DECOMPRESS_BUFFER_PADDING; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -22,10 +21,11 @@ use vortex_buffer::Buffer; use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_error::VortexResult; +use 
vortex_error::vortex_ensure; use crate::OnPair; use crate::OnPairArraySlotsExt; -use crate::decode::FullDecodeInputs; +use crate::decode::OwnedDecodeInputs; pub(super) fn canonicalize_onpair( array: ArrayView<'_, OnPair>, @@ -49,7 +49,7 @@ pub(crate) fn onpair_decode_views( .clone() .execute::(ctx)?; - let inputs = FullDecodeInputs::collect(array, ctx)?; + let inputs = OwnedDecodeInputs::collect(array, ctx)?; let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { lengths @@ -59,8 +59,22 @@ pub(crate) fn onpair_decode_views( .sum() }); - let mut out_bytes = ByteBufferMut::with_capacity(total_size + DECOMPRESS_BUFFER_PADDING); - let written = inputs.decompress_into(out_bytes.spare_capacity_mut()); + let code_start = inputs.code_boundaries.first().copied().unwrap_or_default() as usize; + let code_end = inputs.code_boundaries.last().copied().unwrap_or_default() as usize; + vortex_ensure!( + code_start <= code_end, + "OnPair codes_offsets must be nondecreasing" + ); + vortex_ensure!( + code_end <= inputs.codes.len(), + "OnPair codes_offsets end {} exceeds codes len {}", + code_end, + inputs.codes.len() + ); + + let mut out_bytes = ByteBufferMut::with_capacity(total_size); + let written = + inputs.decompress_code_range_into(code_start..code_end, out_bytes.spare_capacity_mut()); debug_assert_eq!(written, total_size); // SAFETY: `decompress_into` initialised exactly `written` bytes of the // spare capacity reserved above. diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index c76444d75b8..437e09f8f08 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -5,6 +5,7 @@ //! decompression API. use std::mem::MaybeUninit; +use std::ops::Range; use onpair::Parts; use vortex_array::ArrayRef; @@ -72,6 +73,26 @@ impl OwnedDecodeInputs { onpair::decompress_into(self.as_parts(), out) } + /// Decode a contiguous code window into `out`. 
Returns the number of + /// initialised bytes. + #[inline] + pub fn decompress_code_range_into( + &self, + range: Range, + out: &mut [MaybeUninit], + ) -> usize { + onpair::decompress_into( + Parts:: { + dict_bytes: self.dict_bytes.as_slice(), + dict_offsets: self.dict_offsets.as_slice(), + bits: self.bits, + codes: &self.codes.as_slice()[range], + code_boundaries: &[], + }, + out, + ) + } + /// Decode a single row into `out`. Returns the number of initialised /// bytes. #[inline] @@ -89,48 +110,3 @@ impl OwnedDecodeInputs { } } } - -/// Inputs for whole-column decompression. -/// -/// Unlike [`OwnedDecodeInputs`], this deliberately omits the per-row -/// `code_boundaries` (`codes_offsets`) child: the contiguous -/// [`onpair::decompress_into`] decoder walks the flat `codes` stream directly -/// and never consults the per-row boundaries. Materialising that child for a -/// full canonicalisation is pure overhead — for a narrowed/bit-packed -/// `codes_offsets` it also forces an extra child `execute`. -pub struct FullDecodeInputs { - dict_bytes: ByteBuffer, - dict_offsets: Buffer, - codes: Buffer, - bits: u32, -} - -impl FullDecodeInputs { - pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { - Ok(Self { - dict_bytes: array.dict_bytes().clone(), - dict_offsets: collect_widened::(array.dict_offsets(), ctx)?, - codes: collect_widened::(array.codes(), ctx)?, - bits: array.bits(), - }) - } - - /// Decode every row contiguously into `out`. Returns the number of - /// initialised bytes. - #[inline] - pub fn decompress_into(&self, out: &mut [MaybeUninit]) -> usize { - onpair::decompress_into(self.as_parts(), out) - } - - fn as_parts(&self) -> Parts<'_, u32> { - Parts { - dict_bytes: self.dict_bytes.as_slice(), - dict_offsets: self.dict_offsets.as_slice(), - bits: self.bits, - codes: self.codes.as_slice(), - // `decompress_into` never reads the per-row boundaries; an empty - // slice keeps the `Parts` well-typed without materialising them. 
- code_boundaries: &[], - } - } -} diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs index c947de84ec8..e57ce3a5934 100644 --- a/encodings/experimental/onpair/src/ops.rs +++ b/encodings/experimental/onpair/src/ops.rs @@ -1,7 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use onpair::DECOMPRESS_BUFFER_PADDING; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::varbin::varbin_scalar; @@ -21,7 +20,7 @@ impl OperationsVTable for OnPair { ) -> VortexResult { let inputs = OwnedDecodeInputs::collect(array, ctx)?; let len = inputs.decompressed_row_len(index); - let mut buf: Vec = Vec::with_capacity(len + DECOMPRESS_BUFFER_PADDING); + let mut buf: Vec = Vec::with_capacity(len); let written = inputs.decompress_row_into(index, buf.spare_capacity_mut()); debug_assert_eq!(written, len); // SAFETY: `decompress_row_into` initialised `written` bytes of the From 9259c562a61549292c2fc1c5862290a521a1025a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 20:51:27 +0100 Subject: [PATCH 18/27] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 738 ++++++++++++------ Cargo.toml | 2 +- .../experimental/onpair/benches/decode.rs | 71 +- encodings/experimental/onpair/public-api.lock | 216 +++-- .../experimental/onpair/src/canonical.rs | 42 +- encodings/experimental/onpair/src/compress.rs | 56 +- encodings/experimental/onpair/src/decode.rs | 105 +-- encodings/experimental/onpair/src/lib.rs | 2 +- encodings/experimental/onpair/src/ops.rs | 32 +- encodings/experimental/onpair/src/tests.rs | 97 +++ 10 files changed, 862 insertions(+), 499 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0382288dd9c..595f115f820 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -228,25 +228,57 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" 
+[[package]] +name = "arrow" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd47f2a6ddc39244bd722a27ee5da66c03369d087b9e024eafdb03e98b98ea7" +dependencies = [ + "arrow-arith 57.3.1", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 57.3.1", + "arrow-data 57.3.1", + "arrow-ord 57.3.1", + "arrow-row 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "arrow-string 57.3.1", +] + [[package]] name = "arrow" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", "arrow-csv", - "arrow-data", - "arrow-ipc", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", +] + +[[package]] +name = "arrow-arith" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c7bbd679c5418b8639b92be01f361d60013c4906574b578b77b63c78356594c" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "chrono", + "num-traits", ] [[package]] @@ -255,14 +287,32 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "num-traits", ] +[[package]] +name = "arrow-array" +version = "57.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8a4ab47b3f3eac60f7fd31b81e9028fda018607bcc63451aca4f2b755269862" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "chrono", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-array" version = "58.3.0" @@ -270,9 +320,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "chrono-tz", "half", @@ -282,6 +332,18 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-buffer" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d18b89b4c4f4811d0858175e79541fe98e33e18db3b011708bc287b1240593f" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + [[package]] name = "arrow-buffer" version = "58.3.0" @@ -294,18 +356,40 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "722b5c41dd1d14d0a879a1bce92c6fe33f546101bb2acce57a209825edd075b3" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-ord 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + [[package]] name = "arrow-cast" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + 
"arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "atoi", "base64", "chrono", @@ -322,41 +406,68 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "csv", "csv-core", "regex", ] +[[package]] +name = "arrow-data" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1683705c63dcf0d18972759eda48489028cbbff67af7d6bef2c6b7b74ab778a" +dependencies = [ + "arrow-buffer 57.3.1", + "arrow-schema 57.3.1", + "half", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-data" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "half", "num-integer", "num-traits", ] +[[package]] +name = "arrow-ipc" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cf72d04c07229fbf4dbebe7145cac37d7cf7ec582fe705c6b92cb314af096ab" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "flatbuffers", +] + [[package]] name = "arrow-ipc" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "flatbuffers", - "lz4_flex", + "lz4_flex 0.13.1", "zstd", ] 
@@ -366,12 +477,12 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -385,17 +496,43 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-ord" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082342947d4e5a2bcccf029a0a0397e21cb3bb8421edd9571d34fb5dd2670256" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", +] + [[package]] name = "arrow-ord" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", +] + +[[package]] +name = "arrow-row" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a931b520a2a5e22033e01a6f2486b4cdc26f9106b759abeebc320f125e94d7" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "half", ] [[package]] @@ -404,13 +541,19 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 
58.3.0", "half", ] +[[package]] +name = "arrow-schema" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4cf0d4a6609679e03002167a61074a21d7b1ad9ea65e462b2c0a97f8a3b2bc6" + [[package]] name = "arrow-schema" version = "58.3.0" @@ -422,6 +565,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-select" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b320d86a9806923663bb0fd9baa65ecaba81cb0cd77ff8c1768b9716b4ef891" +dependencies = [ + "ahash 0.8.12", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "num-traits", +] + [[package]] name = "arrow-select" version = "58.3.0" @@ -429,24 +586,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "num-traits", ] +[[package]] +name = "arrow-string" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b493e99162e5764077e7823e50ba284858d365922631c7aaefe9487b1abd02c2" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + [[package]] name = "arrow-string" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "memchr", "num-traits", "regex", @@ -1463,8 +1637,8 @@ 
name = "compress-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "clap", @@ -1472,7 +1646,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "lance-bench", - "parquet", + "parquet 58.3.0", "regex", "tokio", "tracing", @@ -1975,8 +2149,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "bzip2", @@ -2014,7 +2188,7 @@ dependencies = [ "log", "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.3.0", "rand 0.9.4", "regex", "sqlparser", @@ -2057,7 +2231,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common", @@ -2082,7 +2256,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2107,8 +2281,8 @@ checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash 0.8.12", "apache-avro", - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "chrono", "half", "hashbrown 0.16.1", @@ -2117,7 +2291,7 @@ dependencies = [ "libc", "log", "object_store 0.13.2", - "parquet", + "parquet 58.3.0", "paste", "recursive", "sqlparser", @@ -2142,7 +2316,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow", + "arrow 58.3.0", 
"async-compression", "async-trait", "bytes", @@ -2177,8 +2351,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common", @@ -2202,7 +2376,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a579c3bd290c66ea4b269493e75e8a3ed42c9c895a651f10210a29538aee50c4" dependencies = [ "apache-avro", - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common", @@ -2221,7 +2395,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common", @@ -2244,7 +2418,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common", @@ -2268,7 +2442,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common", @@ -2288,7 +2462,7 @@ dependencies = [ "log", "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.3.0", "tokio", ] @@ -2304,8 +2478,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "chrono", "dashmap", @@ -2327,7 +2501,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "chrono", "datafusion-common", @@ -2350,7 +2524,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common", "indexmap 2.14.0", "itertools 0.14.0", @@ -2363,8 +2537,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2396,7 +2570,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2418,7 +2592,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", @@ -2430,8 +2604,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2455,7 +2629,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2471,7 +2645,7 
@@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2510,7 +2684,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common", "datafusion-expr", @@ -2531,7 +2705,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -2554,7 +2728,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-expr", "datafusion-functions", @@ -2570,7 +2744,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common", "datafusion-expr-common", @@ -2586,7 +2760,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -2606,9 +2780,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-ord 58.3.0", + 
"arrow-schema 58.3.0", "async-trait", "datafusion-common", "datafusion-common-runtime", @@ -2637,7 +2811,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common", "datafusion-datasource", "datafusion-expr-common", @@ -2668,7 +2842,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "crc32fast", @@ -2695,7 +2869,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common", @@ -2714,7 +2888,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04e5a4a7a49143a68936992b6dbb0db44121c635e9992b2482817278f1e69c56" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bigdecimal", "clap", @@ -2911,7 +3085,7 @@ version = "1.10502.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fdc796383b176dd5a45353fbb5e64583c0ee4da12cb62c9e510b785324b2488" dependencies = [ - "arrow", + "arrow 58.3.0", "cast", "comfy-table", "fallible-iterator", @@ -3266,7 +3440,7 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83cf860f6a6bf0a6a60fdfe5a36c75121fad5ea4332d1d12deee3e65b6047727" dependencies = [ - "arrow-array", + "arrow-array 58.3.0", "rand 0.9.4", ] @@ -4301,16 +4475,16 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d34e854994e84d043897f5ec9fb609221e9e69e3fd52996cd715d979fcd349f6" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - 
"arrow-cast", - "arrow-ipc", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "async_cell", @@ -4369,14 +4543,14 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7827fe404358c27d120ee8ea8ef7b9415c2911d54072bec83dd689d750ae65da" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytes", "futures", "getrandom 0.2.17", @@ -4391,13 +4565,13 @@ name = "lance-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-cast", + "arrow-cast 58.3.0", "async-trait", "clap", "futures", "lance", "lance-encoding", - "parquet", + "parquet 58.3.0", "tempfile", "tokio", "tracing", @@ -4421,9 +4595,9 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b128c213c676cb8e03c62a68670642770825171e64097cc2da97cbb19fe35d29" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -4460,13 +4634,13 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e03b2de71cbcd09b10bf1a17c83cacbc0176ecd97203fb72b9e59d9b8f9a3743" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 
58.3.0", + "arrow-select 58.3.0", "async-trait", "chrono", "datafusion", @@ -4493,10 +4667,10 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fe7c7ea7fd397e495a1646fec360e46ee0cbd75718f1c0e887aad657c5f2944" dependencies = [ - "arrow", - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "futures", "half", @@ -4513,13 +4687,13 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe3f8070835b407d8db9ea8728386bc3207ba23c66a9c22d344e231ef12b77ca" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytemuck", "byteorder", "bytes", @@ -4552,12 +4726,12 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6dfcf654549330df3aef708cd7c12e170feecddd34d6c19dd005b4153213268" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -4586,12 +4760,12 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb8ad0bd10efa2608634a2518b7dd501231e76c56a65fbd6519e23914cc425a" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-channel", "async-recursion", "async-trait", @@ -4652,14 +4826,14 @@ version 
= "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef5314703fa8c8baed04193cc669da80ab42521c6319d3cc921a4a997690dcc0" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -4694,9 +4868,9 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51aa9b73279f505b2bec0f194c7a2390ca74ad3260131e631a7bef8d97d54b2e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "cc", "deepsize", "half", @@ -4712,7 +4886,7 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cd01581f55ce45c49cbe494ee86c7ba7ca4ca3654690fd820941cd9105a46e" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "lance-core", @@ -4741,11 +4915,11 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5db70650465a1af174b7dfe6948ec91a3d466ada12e11274eb66e51132173aa0" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5108,6 +5282,15 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90071f8077f8e40adfc4b7fe9cd495ce316263f19e75c2211eeff3fdf475a3d9" +dependencies = [ + "twox-hash", +] + [[package]] name = "lz4_flex" version = "0.13.1" @@ -5726,11 +5909,16 @@ checksum = 
"cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" [[package]] name = "onpair" version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c79a6daa8ac203293a6a231d8593b678c63ad3d7da6e25d1209ce750c9eb0" dependencies = [ + "arrow-array 57.3.1", + "arrow-schema 57.3.1", + "codspeed-divan-compat", "hashbrown 0.16.1", + "parquet 57.3.1", "rand 0.9.4", + "rstest", + "tpchgen 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "tpchgen-arrow 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -5913,6 +6101,40 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e832c6aa20310fc6de7ea5a3f4e20d34fd83e3b43229d32b81ffe5c14d74692" +dependencies = [ + "ahash 0.8.12", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 57.3.1", + "arrow-data 57.3.1", + "arrow-ipc 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.12.2", + "num-bigint", + "num-integer", + "num-traits", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash", + "zstd", +] + [[package]] name = "parquet" version = "58.3.0" @@ -5920,12 +6142,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "base64", "brotli", "bytes", @@ -5934,7 +6156,7 @@ dependencies = [ "futures", "half", "hashbrown 0.17.1", - "lz4_flex", + "lz4_flex 0.13.1", "num-bigint", "num-integer", "num-traits", @@ -5955,8 +6177,8 @@ 
version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74c8db065291f088a2aad8ab831853eae1871c0d311c8d0b83bbc3b7e735d0fc" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -5971,8 +6193,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a530e8d5b5e14efcb39c9a6ec55432ad11f6afb7dc4455a79be0dc615fe3cc31" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -5988,7 +6210,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00ed89908289f67caa2ca078f9ff9aacd6229a313ec92b12bf4f48f613dc2b97" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "base64", "chrono", "parquet-variant", @@ -8717,18 +8939,34 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" +[[package]] +name = "tpchgen" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d651db770ccf53b89dd769ed47899c0c089452e3b725c3c48fbc6a2be579638" + [[package]] name = "tpchgen" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" +[[package]] +name = "tpchgen-arrow" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180f3759dffbf26d47021d2a84245a00f20945384bcf22e63c32652b04916e5a" +dependencies = [ + "arrow 57.3.1", + "tpchgen 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "tpchgen-arrow" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" dependencies = [ - "arrow", 
- "tpchgen", + "arrow 58.3.0", + "tpchgen 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", ] [[package]] @@ -9058,13 +9296,13 @@ name = "vector-search-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "clap", "futures", "indicatif", - "parquet", + "parquet 58.3.0", "rand 0.10.1", "serde", "tabled", @@ -9088,12 +9326,12 @@ name = "vortex" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", + "arrow-array 58.3.0", "codspeed-divan-compat", "fastlanes", "futures", "mimalloc", - "parquet", + "parquet 58.3.0", "paste", "rand 0.10.1", "rand_distr 0.6.0", @@ -9160,15 +9398,15 @@ dependencies = [ "arbitrary", "arc-swap", "arcref", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", "async-lock", "bytes", "cfg-if", @@ -9232,9 +9470,9 @@ name = "vortex-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "bytes", "bzip2", @@ -9250,7 +9488,7 @@ dependencies = [ "noodles-bgzf", "noodles-vcf", "parking_lot", - "parquet", + "parquet 58.3.0", "rand 0.10.1", "regex", "reqwest 0.13.4", @@ -9262,8 +9500,8 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tpchgen", - "tpchgen-arrow", + "tpchgen 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", + "tpchgen-arrow 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", "tracing", 
"tracing-perfetto", "tracing-subscriber", @@ -9278,9 +9516,9 @@ name = "vortex-bench-migrate" version = "0.1.0-alpha.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "clap", "duckdb", "flate2", @@ -9366,7 +9604,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-buffer", + "arrow-buffer 58.3.0", "bitvec", "bytes", "codspeed-divan-compat", @@ -9397,21 +9635,21 @@ dependencies = [ name = "vortex-compat" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-select", + "arrow-array 58.3.0", + "arrow-select 58.3.0", "base16ct", "bytes", "clap", "futures", - "parquet", + "parquet 58.3.0", "reqwest 0.13.4", "serde", "serde_json", "sha2 0.11.0", "tempfile", "tokio", - "tpchgen", - "tpchgen-arrow", + "tpchgen 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", + "tpchgen-arrow 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", "vortex", "vortex-array", "vortex-buffer", @@ -9456,7 +9694,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-schema", + "arrow-schema 58.3.0", "async-trait", "bindgen", "bytes", @@ -9496,8 +9734,8 @@ name = "vortex-cxx" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cxx", "futures", @@ -9511,7 +9749,7 @@ name = "vortex-datafusion" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema", + "arrow-schema 58.3.0", "async-trait", "datafusion", "datafusion-catalog", @@ -9607,7 +9845,7 @@ dependencies = [ name = "vortex-error" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "flatbuffers", "jiff", "object_store 0.13.2", @@ -9642,8 +9880,8 @@ dependencies = [ name = "vortex-ffi" version = "0.1.0" dependencies = [ - "arrow-array", - 
"arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cbindgen", "futures", @@ -9811,8 +10049,8 @@ dependencies = [ name = "vortex-jni" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "futures", "jni", @@ -9830,8 +10068,8 @@ name = "vortex-layout" version = "0.1.0" dependencies = [ "arcref", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-stream", "async-trait", "bit-vec", @@ -9872,7 +10110,7 @@ dependencies = [ name = "vortex-mask" version = "0.1.0" dependencies = [ - "arrow-buffer", + "arrow-buffer 58.3.0", "codspeed-divan-compat", "itertools 0.14.0", "rstest", @@ -9922,9 +10160,9 @@ dependencies = [ name = "vortex-parquet-variant" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "chrono", "parquet-variant", "parquet-variant-compute", @@ -9968,9 +10206,9 @@ dependencies = [ name = "vortex-python" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "itertools 0.14.0", @@ -9993,8 +10231,8 @@ name = "vortex-runend" version = "0.1.0" dependencies = [ "arbitrary", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "itertools 0.14.0", "num-traits", @@ -10092,8 +10330,8 @@ dependencies = [ name = "vortex-tensor" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "half", "itertools 0.14.0", @@ -10117,8 +10355,8 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-cuda", @@ -10129,8 +10367,8 @@ name 
= "vortex-tui" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "clap", "console_error_panic_hook", "crossterm 0.29.0", @@ -10143,7 +10381,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "js-sys", - "parquet", + "parquet 58.3.0", "ratatui", "ratzilla", "serde", @@ -10193,9 +10431,9 @@ dependencies = [ name = "vortex-web-wasm" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "console_error_panic_hook", "futures", "js-sys", diff --git a/Cargo.toml b/Cargo.toml index dd28de06b7c..a696d96d8ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -190,7 +190,7 @@ num_enum = { version = "0.7.3", default-features = false } object_store = { version = "0.13.1", default-features = false } once_cell = "1.21" oneshot = { version = "0.2.0", features = ["async"] } -onpair = { version = "0.0.3" } +onpair = { path = "../onpair" } opentelemetry = "0.32.0" opentelemetry-otlp = "0.32.0" opentelemetry_sdk = "0.32.0" diff --git a/encodings/experimental/onpair/benches/decode.rs b/encodings/experimental/onpair/benches/decode.rs index f22724ed9e2..2f3fce9db23 100644 --- a/encodings/experimental/onpair/benches/decode.rs +++ b/encodings/experimental/onpair/benches/decode.rs @@ -4,11 +4,11 @@ //! Decode-path microbenchmarks for the OnPair Vortex array. //! //! * `decompress_into` — the upstream `onpair::decompress_into` decoder hot -//! loop, fed by a pre-materialised [`OwnedDecodeInputs`]. Measures the -//! inner loop only (no `collect`, no allocation). +//! loop, fed by pre-materialised [`DecodeInputs`]. Measures the inner loop +//! only (no child `execute`, no allocation). //! * `canonicalize_to_varbinview` — the full Vortex -//! `OnPair → VarBinViewArray` path callers actually hit. Includes -//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc. +//! 
`OnPair → VarBinViewArray` path callers actually hit. Includes child +//! `execute`, the build_views step, allocation, etc. //! //! Each bench sweeps four corpus shapes against two row counts to surface //! cache-pressure cliffs and per-row decode cost. @@ -24,22 +24,59 @@ clippy::expect_used )] +use std::mem::MaybeUninit; use std::sync::LazyLock; use divan::Bencher; +use onpair::Parts; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::filter::FilterKernel; +use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; +use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::session::ArraySession; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; use vortex_mask::Mask; use vortex_onpair::DEFAULT_DICT12_CONFIG; use vortex_onpair::OnPair; use vortex_onpair::OnPairArray; -use vortex_onpair::decode::OwnedDecodeInputs; +use vortex_onpair::OnPairArraySlotsExt; + +/// Host-resident decode inputs, materialised once so the decode-loop benchmark +/// measures only `onpair::decompress_into` (not child `execute`/allocation). 
+struct DecodeInputs { + dict_bytes: ByteBuffer, + dict_offsets: Buffer, + codes: Buffer, + bits: u32, +} + +impl DecodeInputs { + fn as_parts(&self) -> Parts<'_> { + Parts { + dict_bytes: self.dict_bytes.as_slice(), + dict_offsets: self.dict_offsets.as_slice(), + bits: self.bits, + codes: self.codes.as_slice(), + } + } + + fn decompressed_len(&self) -> usize { + onpair::decompressed_len(self.as_parts()) + } + + fn decompress_into(&self, out: &mut [MaybeUninit]) -> usize { + onpair::decompress_into(self.as_parts(), out) + } +} use vortex_onpair::onpair_compress; use vortex_session::VortexSession; @@ -124,10 +161,24 @@ fn compress(n: usize, shape: Shape) -> OnPairArray { .unwrap_or_else(|e| panic!("onpair_compress failed: {e}")) } -fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize) { +/// Canonicalise a slot child to the decoder's native primitive width. +fn widen(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> Buffer { + arr.cast(DType::Primitive(T::PTYPE, arr.dtype().nullability())) + .expect("cast") + .execute::(ctx) + .expect("execute") + .into_buffer::() +} + +fn materialise(arr: &OnPairArray) -> (DecodeInputs, usize) { let mut ctx = SESSION.create_execution_ctx(); - let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) - .unwrap_or_else(|e| panic!("collect: {e}")); + let view = arr.as_view(); + let inputs = DecodeInputs { + dict_bytes: view.dict_bytes().clone(), + dict_offsets: widen::(view.dict_offsets(), &mut ctx), + codes: widen::(view.codes(), &mut ctx), + bits: view.bits(), + }; let total = inputs.decompressed_len(); (inputs, total) } @@ -140,8 +191,8 @@ const CASES: &[(Shape, usize)] = &[ (Shape::HighCard, 100_000), ]; -/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the -/// output allocation. Hits `onpair::decompress_into` directly. +/// Raw decode loop time, excluding child `execute` and the output allocation. +/// Hits `onpair::decompress_into` directly. 
#[divan::bench(args = CASES)] fn decompress_into_bench(bencher: Bencher, case: (Shape, usize)) { let (shape, n) = case; diff --git a/encodings/experimental/onpair/public-api.lock b/encodings/experimental/onpair/public-api.lock index 247d1352723..4278cf82cac 100644 --- a/encodings/experimental/onpair/public-api.lock +++ b/encodings/experimental/onpair/public-api.lock @@ -1,68 +1,10 @@ pub mod vortex_onpair -pub mod vortex_onpair::decode - -pub struct vortex_onpair::decode::DecodeView<'a> - -pub vortex_onpair::decode::DecodeView::codes: &'a [u16] - -pub vortex_onpair::decode::DecodeView::codes_offsets: &'a [u32] - -pub vortex_onpair::decode::DecodeView::dict_bytes: &'a [u8] - -pub vortex_onpair::decode::DecodeView::dict_table: &'a [u64] - -impl<'a> vortex_onpair::decode::DecodeView<'a> - -pub fn vortex_onpair::decode::DecodeView<'a>::decode_row_into(&self, usize, &mut alloc::vec::Vec) - -pub fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into(&self, usize, usize, &mut alloc::vec::Vec) - -pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into_with_size(&self, usize, usize, usize, &mut alloc::vec::Vec) - -pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_unchecked(&self, usize, usize, *mut u8) -> usize - -pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len(&self, usize) -> usize - -pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len_rows(&self, usize, usize) -> usize - -pub fn vortex_onpair::decode::DecodeView<'a>::for_each_dict_slice bool>(&self, usize, F) -> bool - -impl<'a> core::clone::Clone for vortex_onpair::decode::DecodeView<'a> - -pub fn vortex_onpair::decode::DecodeView<'a>::clone(&self) -> vortex_onpair::decode::DecodeView<'a> - -impl<'a> core::marker::Copy for vortex_onpair::decode::DecodeView<'a> - -pub struct vortex_onpair::decode::OwnedDecodeInputs - -pub vortex_onpair::decode::OwnedDecodeInputs::codes: vortex_buffer::buffer::Buffer - -pub vortex_onpair::decode::OwnedDecodeInputs::codes_offsets: 
vortex_buffer::buffer::Buffer - -pub vortex_onpair::decode::OwnedDecodeInputs::dict_bytes: vortex_buffer::ByteBuffer - -pub vortex_onpair::decode::OwnedDecodeInputs::dict_table: vortex_buffer::buffer::Buffer - -impl vortex_onpair::decode::OwnedDecodeInputs - -pub fn vortex_onpair::decode::OwnedDecodeInputs::collect(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult - -pub fn vortex_onpair::decode::OwnedDecodeInputs::view(&self) -> vortex_onpair::decode::DecodeView<'_> - pub struct vortex_onpair::OnPair impl vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::try_new(vortex_array::dtype::DType, vortex_array::buffer::BufferHandle, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::validity::Validity, u32) -> vortex_error::VortexResult - -impl core::clone::Clone for vortex_onpair::OnPair - -pub fn vortex_onpair::OnPair::clone(&self) -> vortex_onpair::OnPair - -impl core::fmt::Debug for vortex_onpair::OnPair - -pub fn vortex_onpair::OnPair::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_onpair::OnPair::try_new(dtype: vortex_array::dtype::DType, dict_bytes: vortex_array::buffer::BufferHandle, dict_offsets: vortex_array::array::erased::ArrayRef, codes: vortex_array::array::erased::ArrayRef, codes_offsets: vortex_array::array::erased::ArrayRef, uncompressed_lengths: vortex_array::array::erased::ArrayRef, validity: vortex_array::validity::Validity, bits: u32) -> vortex_error::VortexResult impl vortex_array::array::vtable::VTable for vortex_onpair::OnPair @@ -72,61 +14,49 @@ pub type vortex_onpair::OnPair::TypedArrayData = vortex_onpair::OnPairData pub type vortex_onpair::OnPair::ValidityVTable = vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::append_to_builder(vortex_array::array::view::ArrayView<'_, Self>, &mut dyn 
vortex_array::builders::ArrayBuilder, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> +pub fn vortex_onpair::OnPair::append_to_builder(array: vortex_array::array::view::ArrayView<'_, Self>, builder: &mut dyn vortex_array::builders::ArrayBuilder, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> -pub fn vortex_onpair::OnPair::buffer(vortex_array::array::view::ArrayView<'_, Self>, usize) -> vortex_array::buffer::BufferHandle +pub fn vortex_onpair::OnPair::buffer(array: vortex_array::array::view::ArrayView<'_, Self>, idx: usize) -> vortex_array::buffer::BufferHandle -pub fn vortex_onpair::OnPair::buffer_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> core::option::Option +pub fn vortex_onpair::OnPair::buffer_name(_array: vortex_array::array::view::ArrayView<'_, Self>, idx: usize) -> core::option::Option -pub fn vortex_onpair::OnPair::deserialize(&self, &vortex_array::dtype::DType, usize, &[u8], &[vortex_array::buffer::BufferHandle], &dyn vortex_array::serde::ArrayChildren, &vortex_session::VortexSession) -> vortex_error::VortexResult> +pub fn vortex_onpair::OnPair::deserialize(&self, dtype: &vortex_array::dtype::DType, len: usize, metadata: &[u8], buffers: &[vortex_array::buffer::BufferHandle], children: &dyn vortex_array::serde::ArrayChildren, _session: &vortex_session::VortexSession) -> vortex_error::VortexResult> -pub fn vortex_onpair::OnPair::execute(vortex_array::array::typed::Array, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_onpair::OnPair::execute(array: vortex_array::array::typed::Array, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_onpair::OnPair::execute_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_onpair::OnPair::execute_parent(array: 
vortex_array::array::view::ArrayView<'_, Self>, parent: &vortex_array::array::erased::ArrayRef, child_idx: usize, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> pub fn vortex_onpair::OnPair::id(&self) -> vortex_array::array::ArrayId -pub fn vortex_onpair::OnPair::nbuffers(vortex_array::array::view::ArrayView<'_, Self>) -> usize +pub fn vortex_onpair::OnPair::nbuffers(_array: vortex_array::array::view::ArrayView<'_, Self>) -> usize -pub fn vortex_onpair::OnPair::reduce_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize) -> vortex_error::VortexResult> +pub fn vortex_onpair::OnPair::reduce_parent(array: vortex_array::array::view::ArrayView<'_, Self>, parent: &vortex_array::array::erased::ArrayRef, child_idx: usize) -> vortex_error::VortexResult> -pub fn vortex_onpair::OnPair::serialize(vortex_array::array::view::ArrayView<'_, Self>, &vortex_session::VortexSession) -> vortex_error::VortexResult>> +pub fn vortex_onpair::OnPair::serialize(array: vortex_array::array::view::ArrayView<'_, Self>, _session: &vortex_session::VortexSession) -> vortex_error::VortexResult>> -pub fn vortex_onpair::OnPair::slot_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> alloc::string::String +pub fn vortex_onpair::OnPair::slot_name(_array: vortex_array::array::view::ArrayView<'_, Self>, idx: usize) -> alloc::string::String -pub fn vortex_onpair::OnPair::validate(&self, &Self::TypedArrayData, &vortex_array::dtype::DType, usize, &[core::option::Option]) -> vortex_error::VortexResult<()> +pub fn vortex_onpair::OnPair::validate(&self, data: &Self::TypedArrayData, dtype: &vortex_array::dtype::DType, len: usize, slots: &[core::option::Option]) -> vortex_error::VortexResult<()> impl vortex_array::array::vtable::operations::OperationsVTable for vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::scalar_at(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, usize, &mut 
vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_onpair::OnPair::scalar_at(array: vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, index: usize, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult impl vortex_array::array::vtable::validity::ValidityVTable for vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::validity(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>) -> vortex_error::VortexResult +pub fn vortex_onpair::OnPair::validity(array: vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>) -> vortex_error::VortexResult impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::filter(vortex_array::array::view::ArrayView<'_, Self>, &vortex_mask::Mask, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_onpair::OnPair::filter(array: vortex_array::array::view::ArrayView<'_, Self>, mask: &vortex_mask::Mask, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> impl vortex_array::arrays::slice::SliceReduce for vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> - -impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_onpair::OnPair - -pub fn vortex_onpair::OnPair::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> - -impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_onpair::OnPair - -pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_onpair::OnPair::slice(array: 
vortex_array::array::view::ArrayView<'_, Self>, range: core::ops::range::Range) -> vortex_error::VortexResult> impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_onpair::OnPair -pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType) -> vortex_error::VortexResult> - -impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for vortex_onpair::OnPair - -pub fn vortex_onpair::OnPair::like(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::like::LikeOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_onpair::OnPair::cast(array: vortex_array::array::view::ArrayView<'_, Self>, dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult> pub struct vortex_onpair::OnPairData @@ -142,27 +72,23 @@ pub fn vortex_onpair::OnPairData::is_empty(&self) -> bool pub fn vortex_onpair::OnPairData::len(&self) -> usize -pub fn vortex_onpair::OnPairData::new(vortex_array::buffer::BufferHandle, u32, usize) -> Self - -impl core::clone::Clone for vortex_onpair::OnPairData - -pub fn vortex_onpair::OnPairData::clone(&self) -> vortex_onpair::OnPairData +pub fn vortex_onpair::OnPairData::new(dict_bytes: vortex_array::buffer::BufferHandle, bits: u32, len: usize) -> Self impl core::fmt::Debug for vortex_onpair::OnPairData -pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_onpair::OnPairData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl core::fmt::Display for vortex_onpair::OnPairData -pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_onpair::OnPairData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl vortex_array::hash::ArrayEq for vortex_onpair::OnPairData -pub fn vortex_onpair::OnPairData::array_eq(&self, &Self, 
vortex_array::hash::Precision) -> bool +pub fn vortex_onpair::OnPairData::array_eq(&self, other: &Self, precision: vortex_array::hash::Precision) -> bool impl vortex_array::hash::ArrayHash for vortex_onpair::OnPairData -pub fn vortex_onpair::OnPairData::array_hash(&self, &mut H, vortex_array::hash::Precision) +pub fn vortex_onpair::OnPairData::array_hash(&self, state: &mut H, precision: vortex_array::hash::Precision) pub struct vortex_onpair::OnPairMetadata @@ -174,7 +100,7 @@ pub vortex_onpair::OnPairMetadata::codes_ptype: i32 pub vortex_onpair::OnPairMetadata::dict_offsets_ptype: i32 -pub vortex_onpair::OnPairMetadata::dict_size: u64 +pub vortex_onpair::OnPairMetadata::dict_size: u32 pub vortex_onpair::OnPairMetadata::total_tokens: u64 @@ -188,13 +114,13 @@ pub fn vortex_onpair::OnPairMetadata::codes_ptype(&self) -> vortex_array::dtype: pub fn vortex_onpair::OnPairMetadata::dict_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType -pub fn vortex_onpair::OnPairMetadata::set_codes_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) +pub fn vortex_onpair::OnPairMetadata::set_codes_offsets_ptype(&mut self, value: vortex_array::dtype::ptype::PType) -pub fn vortex_onpair::OnPairMetadata::set_codes_ptype(&mut self, vortex_array::dtype::ptype::PType) +pub fn vortex_onpair::OnPairMetadata::set_codes_ptype(&mut self, value: vortex_array::dtype::ptype::PType) -pub fn vortex_onpair::OnPairMetadata::set_dict_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) +pub fn vortex_onpair::OnPairMetadata::set_dict_offsets_ptype(&mut self, value: vortex_array::dtype::ptype::PType) -pub fn vortex_onpair::OnPairMetadata::set_uncompressed_lengths_ptype(&mut self, vortex_array::dtype::ptype::PType) +pub fn vortex_onpair::OnPairMetadata::set_uncompressed_lengths_ptype(&mut self, value: vortex_array::dtype::ptype::PType) pub fn vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype(&self) -> vortex_array::dtype::ptype::PType @@ -202,17 +128,13 @@ impl 
vortex_onpair::OnPairMetadata pub fn vortex_onpair::OnPairMetadata::get_uncompressed_lengths_ptype(&self) -> vortex_error::VortexResult -impl core::clone::Clone for vortex_onpair::OnPairMetadata - -pub fn vortex_onpair::OnPairMetadata::clone(&self) -> vortex_onpair::OnPairMetadata - impl core::default::Default for vortex_onpair::OnPairMetadata pub fn vortex_onpair::OnPairMetadata::default() -> Self impl core::fmt::Debug for vortex_onpair::OnPairMetadata -pub fn vortex_onpair::OnPairMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_onpair::OnPairMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl prost::message::Message for vortex_onpair::OnPairMetadata @@ -220,40 +142,88 @@ pub fn vortex_onpair::OnPairMetadata::clear(&mut self) pub fn vortex_onpair::OnPairMetadata::encoded_len(&self) -> usize -pub const vortex_onpair::DEFAULT_DICT12_CONFIG: onpair::Config +pub struct vortex_onpair::OnPairSlots + +pub vortex_onpair::OnPairSlots::codes: vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlots::codes_offsets: vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlots::dict_offsets: vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlots::uncompressed_lengths: vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlots::validity: core::option::Option + +impl vortex_onpair::OnPairSlots + +pub const vortex_onpair::OnPairSlots::CODES: usize + +pub const vortex_onpair::OnPairSlots::CODES_OFFSETS: usize + +pub const vortex_onpair::OnPairSlots::COUNT: usize + +pub const vortex_onpair::OnPairSlots::DICT_OFFSETS: usize + +pub const vortex_onpair::OnPairSlots::NAMES: [&'static str; 5] + +pub const vortex_onpair::OnPairSlots::UNCOMPRESSED_LENGTHS: usize + +pub const vortex_onpair::OnPairSlots::VALIDITY: usize + +pub fn vortex_onpair::OnPairSlots::from_slots(slots: vortex_array::array::ArraySlots) -> Self + +pub fn vortex_onpair::OnPairSlots::into_slots(self) 
-> vortex_array::array::ArraySlots + +pub struct vortex_onpair::OnPairSlotsView<'a> + +pub vortex_onpair::OnPairSlotsView::codes: &'a vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlotsView::codes_offsets: &'a vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlotsView::dict_offsets: &'a vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlotsView::uncompressed_lengths: &'a vortex_array::array::erased::ArrayRef + +pub vortex_onpair::OnPairSlotsView::validity: core::option::Option<&'a vortex_array::array::erased::ArrayRef> + +impl<'a> vortex_onpair::OnPairSlotsView<'a> + +pub fn vortex_onpair::OnPairSlotsView<'a>::from_slots(slots: &'a [core::option::Option]) -> Self + +pub fn vortex_onpair::OnPairSlotsView<'a>::to_owned(&self) -> vortex_onpair::OnPairSlots + +pub const vortex_onpair::DEFAULT_DICT12_CONFIG: onpair::config::Config pub const vortex_onpair::MAX_TOKEN_SIZE: usize -pub trait vortex_onpair::OnPairArrayExt: vortex_array::array::typed::TypedArrayRef +pub trait vortex_onpair::OnPairArrayExt: vortex_onpair::OnPairArraySlotsExt pub fn vortex_onpair::OnPairArrayExt::array_validity(&self) -> vortex_array::validity::Validity -pub fn vortex_onpair::OnPairArrayExt::codes(&self) -> &vortex_array::array::erased::ArrayRef - -pub fn vortex_onpair::OnPairArrayExt::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef +impl vortex_onpair::OnPairArrayExt for T -pub fn vortex_onpair::OnPairArrayExt::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef +pub trait vortex_onpair::OnPairArraySlotsExt: vortex_array::array::typed::TypedArrayRef -pub fn vortex_onpair::OnPairArrayExt::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef +pub fn vortex_onpair::OnPairArraySlotsExt::codes(&self) -> &vortex_array::array::erased::ArrayRef -impl> vortex_onpair::OnPairArrayExt for T +pub fn vortex_onpair::OnPairArraySlotsExt::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef -pub fn 
T::array_validity(&self) -> vortex_array::validity::Validity +pub fn vortex_onpair::OnPairArraySlotsExt::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef -pub fn T::codes(&self) -> &vortex_array::array::erased::ArrayRef +pub fn vortex_onpair::OnPairArraySlotsExt::slots_view(&self) -> vortex_onpair::OnPairSlotsView<'_> -pub fn T::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef +pub fn vortex_onpair::OnPairArraySlotsExt::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef -pub fn T::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef +pub fn vortex_onpair::OnPairArraySlotsExt::validity(&self) -> core::option::Option<&vortex_array::array::erased::ArrayRef> -pub fn T::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef +impl> vortex_onpair::OnPairArraySlotsExt for T -pub fn vortex_onpair::onpair_compress>(A, usize, &vortex_array::dtype::DType, onpair::Config) -> vortex_error::VortexResult +pub fn vortex_onpair::onpair_compress>(array: A, len: usize, dtype: &vortex_array::dtype::DType, config: onpair::config::Config) -> vortex_error::VortexResult -pub fn vortex_onpair::onpair_compress_array(&vortex_array::array::erased::ArrayRef, onpair::Config, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult +pub fn vortex_onpair::onpair_compress_array(array: &vortex_array::array::erased::ArrayRef, config: onpair::config::Config, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult -pub fn vortex_onpair::onpair_compress_array_default(&vortex_array::array::erased::ArrayRef, onpair::Config) -> vortex_error::VortexResult +pub fn vortex_onpair::onpair_compress_array_default(array: &vortex_array::array::erased::ArrayRef, config: onpair::config::Config) -> vortex_error::VortexResult -pub fn vortex_onpair::onpair_compress_iter<'a, I>(I, usize, vortex_array::dtype::DType, onpair::Config) -> vortex_error::VortexResult where I: core::iter::traits::iterator::Iterator> +pub 
fn vortex_onpair::onpair_compress_iter<'a, I>(iter: I, len: usize, dtype: vortex_array::dtype::DType, config: onpair::config::Config) -> vortex_error::VortexResult where I: core::iter::traits::iterator::Iterator> pub type vortex_onpair::OnPairArray = vortex_array::array::typed::Array diff --git a/encodings/experimental/onpair/src/canonical.rs b/encodings/experimental/onpair/src/canonical.rs index 5700bd2cf8f..c658b841155 100644 --- a/encodings/experimental/onpair/src/canonical.rs +++ b/encodings/experimental/onpair/src/canonical.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use num_traits::AsPrimitive; +use onpair::Parts; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -25,7 +26,8 @@ use vortex_error::vortex_ensure; use crate::OnPair; use crate::OnPairArraySlotsExt; -use crate::decode::OwnedDecodeInputs; +use crate::decode::code_boundary_at; +use crate::decode::collect_widened; pub(super) fn canonicalize_onpair( array: ArrayView<'_, OnPair>, @@ -49,8 +51,6 @@ pub(crate) fn onpair_decode_views( .clone() .execute::(ctx)?; - let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { lengths .as_slice::

() @@ -59,22 +59,46 @@ pub(crate) fn onpair_decode_views( .sum() }); - let code_start = inputs.code_boundaries.first().copied().unwrap_or_default() as usize; - let code_end = inputs.code_boundaries.last().copied().unwrap_or_default() as usize; + // `codes_offsets` holds the per-row code boundaries and may itself be a + // sliced or filtered view of the original. Its first and last entries + // bound the contiguous run of `codes` belonging to the rows present in + // this array: `slice` keeps the full `codes` child and only narrows + // `codes_offsets` (so `code_start > 0` and/or `code_end < codes.len()`), + // while `filter` rebuilds both children so the window is the whole stream. + // OnPair has no `TakeExecute`, so a reordering take is served from the + // canonical `VarBinView` and never reaches this path. We only need those + // two boundaries, so point-look them up rather than decoding every offset. + let codes_offsets = array.codes_offsets(); + let code_start = code_boundary_at(codes_offsets, 0, ctx)?; + let code_end = code_boundary_at(codes_offsets, array.len(), ctx)?; vortex_ensure!( code_start <= code_end, "OnPair codes_offsets must be nondecreasing" ); vortex_ensure!( - code_end <= inputs.codes.len(), + code_end <= array.codes().len(), "OnPair codes_offsets end {} exceeds codes len {}", code_end, - inputs.codes.len() + array.codes().len() ); + // Slice the `codes` child to that window *before* unpacking it, so a sliced + // array materialises only its own codes rather than the whole column's. The + // contiguous decoder walks `codes` in order and never reads the per-row + // boundaries, so no boundary slice has to be supplied.
+ let codes = collect_widened::(&array.codes().slice(code_start..code_end)?, ctx)?; + let dict_offsets = collect_widened::(array.dict_offsets(), ctx)?; + let mut out_bytes = ByteBufferMut::with_capacity(total_size); - let written = - inputs.decompress_code_range_into(code_start..code_end, out_bytes.spare_capacity_mut()); + let written = onpair::decompress_into( + Parts { + dict_bytes: array.dict_bytes().as_slice(), + dict_offsets: dict_offsets.as_slice(), + bits: array.bits(), + codes: codes.as_slice(), + }, + out_bytes.spare_capacity_mut(), + ); debug_assert_eq!(written, total_size); // SAFETY: `decompress_into` initialised exactly `written` bytes of the // spare capacity reserved above. diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs index bb3b6f95e88..4c3de89f316 100644 --- a/encodings/experimental/onpair/src/compress.rs +++ b/encodings/experimental/onpair/src/compress.rs @@ -65,7 +65,8 @@ where let column = onpair::compress(&flat, &offsets, config) .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; - let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = parts_to_children(&column)?; + let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = + parts_to_children(&column, &offsets)?; let uncompressed_lengths = uncompressed_lengths.into_array(); let validity = match dtype.nullability() { @@ -87,8 +88,13 @@ where /// Lift a compressed [`Column`] into Vortex children + the dict buffer. /// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. +/// +/// `row_byte_offsets` are the per-row decoded byte boundaries that were handed +/// to [`onpair::compress`]; they let us recover the per-row *code* boundaries +/// (the `codes_offsets` child), which the `onpair` crate no longer returns. 
fn parts_to_children( - column: &Column, + column: &Column, + row_byte_offsets: &[u64], ) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { let bits = column.bits; // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the @@ -106,23 +112,49 @@ fn parts_to_children( let dict_bytes = BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); - let dict_offsets = Buffer::::copy_from(column.dict_offsets.as_slice()).into_array(); // The crate emits already-unpacked token codes (one `u16` per token), so // they map straight onto the `codes` slot child. let codes = Buffer::::copy_from(column.codes.as_slice()).into_array(); - // Per-row boundaries are `u64`; the array stores them as `u32`. Token - // counts comfortably fit `u32` for any single chunk. - let codes_offsets: Vec = column - .code_boundaries - .iter() - .map(|&b| { - u32::try_from(b).map_err(|_| vortex_err!("OnPair: code boundary {b} does not fit u32")) - }) - .collect::>()?; + // Recover the per-row code boundaries. `onpair::compress` no longer returns + // them, but its tokenizer never lets a token span a row boundary, so a + // row's codes decode to exactly its byte span. Walk `codes`, summing each + // token's byte length (`dict_offsets[c+1] - dict_offsets[c]`), and close a + // row when the accumulated decoded length reaches that row's byte offset. + let codes_offsets = build_codes_offsets(&column.codes, &column.dict_offsets, row_byte_offsets)?; + + let dict_offsets = Buffer::::copy_from(column.dict_offsets.as_slice()).into_array(); let codes_offsets = Buffer::::copy_from(codes_offsets).into_array(); Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) } +/// Reconstruct the per-row `codes_offsets` from the flat `codes`, the +/// dictionary `dict_offsets` (token byte lengths) and the per-row decoded byte +/// boundaries. Returns `nrows + 1` cumulative code counts (`u32`). 
+fn build_codes_offsets( + codes: &[u16], + dict_offsets: &[u32], + row_byte_offsets: &[u64], +) -> VortexResult> { + let nrows = row_byte_offsets.len() - 1; + let mut codes_offsets = Vec::with_capacity(nrows + 1); + codes_offsets.push(0u32); + let mut decoded_bytes: u64 = 0; + let mut code_idx: usize = 0; + for r in 0..nrows { + let target = row_byte_offsets[r + 1]; + while decoded_bytes < target { + let code = codes[code_idx] as usize; + decoded_bytes += u64::from(dict_offsets[code + 1] - dict_offsets[code]); + code_idx += 1; + } + codes_offsets.push( + u32::try_from(code_idx) + .map_err(|_| vortex_err!("OnPair: code boundary {code_idx} does not fit u32"))?, + ); + } + Ok(codes_offsets) +} + /// Compress a byte-string accessor (typically a `VarBinArray` or /// `VarBinViewArray`). pub fn onpair_compress>( diff --git a/encodings/experimental/onpair/src/decode.rs b/encodings/experimental/onpair/src/decode.rs index 437e09f8f08..e3c7346f0d9 100644 --- a/encodings/experimental/onpair/src/decode.rs +++ b/encodings/experimental/onpair/src/decode.rs @@ -1,38 +1,21 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors // -//! Bridge between [`OnPair`] slot children and the upstream `onpair` crate's -//! decompression API. +//! Helpers for turning [`OnPair`] slot children into the inputs the upstream +//! `onpair` decoder consumes. -use std::mem::MaybeUninit; -use std::ops::Range; - -use onpair::Parts; use vortex_array::ArrayRef; -use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; use vortex_array::dtype::NativePType; use vortex_buffer::Buffer; -use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; - -use crate::OnPair; -use crate::OnPairArraySlotsExt; - -/// Materialised, host-resident copies of every read path's input. 
-pub struct OwnedDecodeInputs { - pub dict_bytes: ByteBuffer, - pub dict_offsets: Buffer, - pub codes: Buffer, - pub code_boundaries: Buffer, - pub bits: u32, -} +use vortex_error::vortex_err; /// Canonicalise a slot child to the decoder's native primitive width. -fn collect_widened( +pub(crate) fn collect_widened( arr: &ArrayRef, ctx: &mut ExecutionCtx, ) -> VortexResult> { @@ -43,70 +26,18 @@ fn collect_widened( .into_buffer::()) } -impl OwnedDecodeInputs { - pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { - Ok(Self { - dict_bytes: array.dict_bytes().clone(), - dict_offsets: collect_widened::(array.dict_offsets(), ctx)?, - codes: collect_widened::(array.codes(), ctx)?, - code_boundaries: collect_widened::(array.codes_offsets(), ctx)?, - bits: array.bits(), - }) - } - - /// Total decoded byte length across all rows. - #[inline] - pub fn decompressed_len(&self) -> usize { - onpair::decompressed_len(self.as_parts()) - } - - /// Decoded byte length of a single row. - #[inline] - pub fn decompressed_row_len(&self, row: usize) -> usize { - onpair::decompressed_row_len(self.as_parts(), row) - } - - /// Decode every row contiguously into `out`. Returns the number of - /// initialised bytes. - #[inline] - pub fn decompress_into(&self, out: &mut [MaybeUninit]) -> usize { - onpair::decompress_into(self.as_parts(), out) - } - - /// Decode a contiguous code window into `out`. Returns the number of - /// initialised bytes. - #[inline] - pub fn decompress_code_range_into( - &self, - range: Range, - out: &mut [MaybeUninit], - ) -> usize { - onpair::decompress_into( - Parts:: { - dict_bytes: self.dict_bytes.as_slice(), - dict_offsets: self.dict_offsets.as_slice(), - bits: self.bits, - codes: &self.codes.as_slice()[range], - code_boundaries: &[], - }, - out, - ) - } - - /// Decode a single row into `out`. Returns the number of initialised - /// bytes. 
- #[inline] - pub fn decompress_row_into(&self, row: usize, out: &mut [MaybeUninit]) -> usize { - onpair::decompress_row_into(self.as_parts(), row, out) - } - - fn as_parts(&self) -> Parts<'_, u32> { - Parts { - dict_bytes: self.dict_bytes.as_slice(), - dict_offsets: self.dict_offsets.as_slice(), - bits: self.bits, - codes: self.codes.as_slice(), - code_boundaries: self.code_boundaries.as_slice(), - } - } +/// Read one `codes_offsets` boundary by point lookup. This decodes at most a +/// single chunk of the child — never the whole per-row offsets array — so the +/// callers that only need a row window (`scalar_at`, the canonical decode's +/// start/end bounds) don't pay to materialise every boundary. +pub(crate) fn code_boundary_at( + codes_offsets: &ArrayRef, + index: usize, + ctx: &mut ExecutionCtx, +) -> VortexResult { + codes_offsets + .execute_scalar(index, ctx)? + .as_primitive() + .as_::() + .ok_or_else(|| vortex_err!("OnPair codes_offsets[{index}] is null")) } diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs index b07ee395bf9..719d5b0eab2 100644 --- a/encodings/experimental/onpair/src/lib.rs +++ b/encodings/experimental/onpair/src/lib.rs @@ -14,7 +14,7 @@ mod array; mod canonical; mod compress; mod compute; -pub mod decode; +mod decode; mod kernel; mod ops; mod rules; diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs index e57ce3a5934..2272f6f5fca 100644 --- a/encodings/experimental/onpair/src/ops.rs +++ b/encodings/experimental/onpair/src/ops.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use onpair::Parts; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::varbin::varbin_scalar; @@ -10,7 +11,9 @@ use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use crate::OnPair; -use crate::decode::OwnedDecodeInputs; +use crate::OnPairArraySlotsExt; +use 
crate::decode::code_boundary_at; +use crate::decode::collect_widened; impl OperationsVTable for OnPair { fn scalar_at( @@ -18,13 +21,30 @@ impl OperationsVTable for OnPair { index: usize, ctx: &mut ExecutionCtx, ) -> VortexResult { - let inputs = OwnedDecodeInputs::collect(array, ctx)?; - let len = inputs.decompressed_row_len(index); + // A row owns a variable-length run of the flat `codes` stream; the + // per-row `codes_offsets` boundaries map the row index to that run. + // Read just this row's two boundaries (point lookups that decode at + // most one chunk of `codes_offsets`) and decode only that run — never + // the whole column. + let codes_offsets = array.codes_offsets(); + let row_start = code_boundary_at(codes_offsets, index, ctx)?; + let row_end = code_boundary_at(codes_offsets, index + 1, ctx)?; + + let codes = collect_widened::(&array.codes().slice(row_start..row_end)?, ctx)?; + let dict_offsets = collect_widened::(array.dict_offsets(), ctx)?; + let parts = Parts { + dict_bytes: array.dict_bytes().as_slice(), + dict_offsets: dict_offsets.as_slice(), + bits: array.bits(), + codes: codes.as_slice(), + }; + + let len = onpair::decompressed_len(parts); let mut buf: Vec = Vec::with_capacity(len); - let written = inputs.decompress_row_into(index, buf.spare_capacity_mut()); + let written = onpair::decompress_into(parts, buf.spare_capacity_mut()); debug_assert_eq!(written, len); - // SAFETY: `decompress_row_into` initialised `written` bytes of the - // spare capacity reserved above. + // SAFETY: `decompress_into` initialised `written` bytes of the spare + // capacity reserved above. 
unsafe { buf.set_len(written) }; Ok(varbin_scalar(ByteBuffer::from(buf), array.dtype())) } diff --git a/encodings/experimental/onpair/src/tests.rs b/encodings/experimental/onpair/src/tests.rs index e1cec96a1c2..dd6fe4b0116 100644 --- a/encodings/experimental/onpair/src/tests.rs +++ b/encodings/experimental/onpair/src/tests.rs @@ -134,6 +134,52 @@ fn test_onpair_scalar_at() { assert_eq!(v.as_bytes(), b"https://www.test.org/page"); } +/// `scalar_at` must decode only the requested row's code window — fetching +/// its two `codes_offsets` boundaries via point lookup, not by materialising +/// the whole `codes_offsets`/`codes` children. Verify correctness at several +/// indices (including the last row) on a full array, and on a *sliced* array +/// where `codes_offsets` is itself a narrowed view and the row index is +/// relative to the slice. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_scalar_at_window() -> vortex_error::VortexResult<()> { + let n = 2_000usize; + let strings: Vec = (0..n) + .map(|i| format!("https://www.example.com/items/{i:08}/page?q={i}")) + .collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG)?.into_array(); + + let mut ctx = SESSION.create_execution_ctx(); + for &i in &[0usize, 1, 999, 1000, n - 1] { + let got = arr.execute_scalar(i, &mut ctx)?; + assert_eq!( + got.as_utf8().value().unwrap().as_bytes(), + strings[i].as_bytes(), + "full array row {i}" + ); + } + + // Sliced array: `codes_offsets` is narrowed (first boundary > 0), so the + // point lookup must resolve indices relative to the slice. 
+ let (start, end) = (700usize, 1300usize); + let sliced = arr.slice(start..end)?; + assert!(sliced.is::(), "slice dropped OnPair encoding"); + for &j in &[0usize, 1, 300, end - start - 1] { + let got = sliced.execute_scalar(j, &mut ctx)?; + assert_eq!( + got.as_utf8().value().unwrap().as_bytes(), + strings[start + j].as_bytes(), + "sliced row {j}" + ); + } + Ok(()) +} + /// The hot decode loop is 4×-unrolled with a scalar tail. Anything that /// lands in the tail (1-3 leftover tokens, or zero total tokens) must /// produce the same bytes as the unrolled body. Hit every row-count @@ -386,3 +432,54 @@ fn test_onpair_filter_with_narrowed_codes_offsets_u8() { .expect("OnPair filter must return Some"); assert_eq!(filtered.len(), n / 2); } + +/// Regression: canonicalising a *sliced* OnPair array. `slice` keeps the full +/// `codes` child and only narrows `codes_offsets`, so a sliced array has a +/// non-contiguous code window (`code_start > 0` and/or `code_end < +/// codes.len()`). `onpair_decode_views` must decode exactly that window; +/// decoding the whole `codes` stream — as a boundary-agnostic whole-column +/// decoder would — yields the wrong bytes (and over-runs the output) for any +/// partial slice. `filter` never produces this shape (it rebuilds `codes` +/// contiguously), so the existing filter tests do not cover it. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_slice_canonicalize() -> vortex_error::VortexResult<()> { + let n = 5_000usize; + let strings: Vec = (0..n) + .map(|i| format!("https://www.example.com/items/{i:08}")) + .collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG)?.into_array(); + + // interior (start>0, end<n), prefix (start=0, end<n), suffix (start>0, + // end=n), and a near-full window.
+ for (start, end) in [(1234usize, 1240usize), (0, 7), (4993, n), (1, n - 1)] { + let sliced = arr.clone().slice(start..end)?; + assert_eq!(sliced.len(), end - start); + assert!( + sliced.is::(), + "slice dropped OnPair encoding: got {}", + sliced.encoding_id() + ); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = sliced.execute::(&mut ctx)?; + canonical.with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), end - start, "window {start}..{end} length"); + for (i, want) in strings[start..end].iter().enumerate() { + assert_eq!( + got[i].as_deref(), + Some(want.as_bytes()), + "window {start}..{end} row {i}" + ); + } + Ok::<_, vortex_error::VortexError>(()) + })?; + } + Ok(()) +} From 69d801e89ce490b8ef152ec2e4709c82663f3329 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 20:51:49 +0100 Subject: [PATCH 19/27] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 736 ++++++++++++++++++----------------------------------- 1 file changed, 248 insertions(+), 488 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 595f115f820..5c8fc94c8dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -228,57 +228,25 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "arrow" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd47f2a6ddc39244bd722a27ee5da66c03369d087b9e024eafdb03e98b98ea7" -dependencies = [ - "arrow-arith 57.3.1", - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-cast 57.3.1", - "arrow-data 57.3.1", - "arrow-ord 57.3.1", - "arrow-row 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "arrow-string 57.3.1", -] - [[package]] name = "arrow" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", "arrow-csv", - "arrow-data 58.3.0", - "arrow-ipc 58.3.0", + "arrow-data", + "arrow-ipc", "arrow-json", - "arrow-ord 58.3.0", - "arrow-row 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", - "arrow-string 58.3.0", -] - -[[package]] -name = "arrow-arith" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7bbd679c5418b8639b92be01f361d60013c4906574b578b77b63c78356594c" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "chrono", - "num-traits", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", ] [[package]] @@ -287,32 +255,14 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "num-traits", ] -[[package]] -name = "arrow-array" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8a4ab47b3f3eac60f7fd31b81e9028fda018607bcc63451aca4f2b755269862" -dependencies = [ - "ahash 0.8.12", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "chrono", - "half", - "hashbrown 0.16.1", - "num-complex", - "num-integer", - "num-traits", -] - [[package]] name = "arrow-array" version = "58.3.0" @@ -320,9 +270,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - "arrow-buffer 58.3.0", - 
"arrow-data 58.3.0", - "arrow-schema 58.3.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "chrono-tz", "half", @@ -332,18 +282,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-buffer" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d18b89b4c4f4811d0858175e79541fe98e33e18db3b011708bc287b1240593f" -dependencies = [ - "bytes", - "half", - "num-bigint", - "num-traits", -] - [[package]] name = "arrow-buffer" version = "58.3.0" @@ -356,40 +294,18 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-cast" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "722b5c41dd1d14d0a879a1bce92c6fe33f546101bb2acce57a209825edd075b3" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-ord 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "atoi", - "base64", - "chrono", - "comfy-table", - "half", - "lexical-core", - "num-traits", - "ryu", -] - [[package]] name = "arrow-cast" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", "atoi", "base64", "chrono", @@ -406,68 +322,41 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array 58.3.0", - "arrow-cast 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-cast", + "arrow-schema", "chrono", "csv", "csv-core", "regex", ] -[[package]] -name = "arrow-data" -version = "57.3.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1683705c63dcf0d18972759eda48489028cbbff67af7d6bef2c6b7b74ab778a" -dependencies = [ - "arrow-buffer 57.3.1", - "arrow-schema 57.3.1", - "half", - "num-integer", - "num-traits", -] - [[package]] name = "arrow-data" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer 58.3.0", - "arrow-schema 58.3.0", + "arrow-buffer", + "arrow-schema", "half", "num-integer", "num-traits", ] -[[package]] -name = "arrow-ipc" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cf72d04c07229fbf4dbebe7145cac37d7cf7ec582fe705c6b92cb314af096ab" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "flatbuffers", -] - [[package]] name = "arrow-ipc" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "flatbuffers", - "lz4_flex 0.13.1", + "lz4_flex", "zstd", ] @@ -477,12 +366,12 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", "chrono", "half", "indexmap 2.14.0", @@ -496,43 +385,17 @@ dependencies = [ "simdutf8", ] -[[package]] -name = "arrow-ord" 
-version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "082342947d4e5a2bcccf029a0a0397e21cb3bb8421edd9571d34fb5dd2670256" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", -] - [[package]] name = "arrow-ord" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", -] - -[[package]] -name = "arrow-row" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3a931b520a2a5e22033e01a6f2486b4cdc26f9106b759abeebc320f125e94d7" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "half", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] [[package]] @@ -541,19 +404,13 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", ] -[[package]] -name = "arrow-schema" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4cf0d4a6609679e03002167a61074a21d7b1ad9ea65e462b2c0a97f8a3b2bc6" - [[package]] name = "arrow-schema" version = "58.3.0" @@ -565,20 +422,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "arrow-select" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b320d86a9806923663bb0fd9baa65ecaba81cb0cd77ff8c1768b9716b4ef891" -dependencies = [ - "ahash 0.8.12", - 
"arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "num-traits", -] - [[package]] name = "arrow-select" version = "58.3.0" @@ -586,41 +429,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num-traits", ] -[[package]] -name = "arrow-string" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b493e99162e5764077e7823e50ba284858d365922631c7aaefe9487b1abd02c2" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "memchr", - "num-traits", - "regex", - "regex-syntax", -] - [[package]] name = "arrow-string" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "memchr", "num-traits", "regex", @@ -1637,8 +1463,8 @@ name = "compress-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "async-trait", "bytes", "clap", @@ -1646,7 +1472,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "lance-bench", - "parquet 58.3.0", + "parquet", "regex", "tokio", "tracing", @@ -2149,8 +1975,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow 58.3.0", - "arrow-schema 
58.3.0", + "arrow", + "arrow-schema", "async-trait", "bytes", "bzip2", @@ -2188,7 +2014,7 @@ dependencies = [ "log", "object_store 0.13.2", "parking_lot", - "parquet 58.3.0", + "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -2231,7 +2057,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "dashmap", "datafusion-common", @@ -2256,7 +2082,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2281,8 +2107,8 @@ checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash 0.8.12", "apache-avro", - "arrow 58.3.0", - "arrow-ipc 58.3.0", + "arrow", + "arrow-ipc", "chrono", "half", "hashbrown 0.16.1", @@ -2291,7 +2117,7 @@ dependencies = [ "libc", "log", "object_store 0.13.2", - "parquet 58.3.0", + "parquet", "paste", "recursive", "sqlparser", @@ -2316,7 +2142,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow 58.3.0", + "arrow", "async-compression", "async-trait", "bytes", @@ -2351,8 +2177,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow 58.3.0", - "arrow-ipc 58.3.0", + "arrow", + "arrow-ipc", "async-trait", "bytes", "datafusion-common", @@ -2376,7 +2202,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a579c3bd290c66ea4b269493e75e8a3ed42c9c895a651f10210a29538aee50c4" dependencies = [ "apache-avro", - "arrow 58.3.0", + "arrow", 
"async-trait", "bytes", "datafusion-common", @@ -2395,7 +2221,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "bytes", "datafusion-common", @@ -2418,7 +2244,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "bytes", "datafusion-common", @@ -2442,7 +2268,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "bytes", "datafusion-common", @@ -2462,7 +2288,7 @@ dependencies = [ "log", "object_store 0.13.2", "parking_lot", - "parquet 58.3.0", + "parquet", "tokio", ] @@ -2478,8 +2304,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow 58.3.0", - "arrow-buffer 58.3.0", + "arrow", + "arrow-buffer", "async-trait", "chrono", "dashmap", @@ -2501,7 +2327,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "chrono", "datafusion-common", @@ -2524,7 +2350,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow 58.3.0", + "arrow", "datafusion-common", "indexmap 2.14.0", "itertools 0.14.0", @@ -2537,8 +2363,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow 58.3.0", - "arrow-buffer 58.3.0", + "arrow", + "arrow-buffer", "base64", "blake2", "blake3", @@ -2570,7 +2396,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash 0.8.12", - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2592,7 +2418,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash 0.8.12", - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", @@ -2604,8 +2430,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow 58.3.0", - "arrow-ord 58.3.0", + "arrow", + "arrow-ord", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2629,7 +2455,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2645,7 +2471,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2684,7 +2510,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow 58.3.0", + "arrow", "chrono", "datafusion-common", "datafusion-expr", @@ -2705,7 +2531,7 @@ source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash 0.8.12", - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -2728,7 +2554,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-expr", "datafusion-functions", @@ -2744,7 +2570,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash 0.8.12", - "arrow 58.3.0", + "arrow", "chrono", "datafusion-common", "datafusion-expr-common", @@ -2760,7 +2586,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -2780,9 +2606,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash 0.8.12", - "arrow 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", + "arrow", + "arrow-ord", + "arrow-schema", "async-trait", "datafusion-common", "datafusion-common-runtime", @@ -2811,7 +2637,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow 58.3.0", + "arrow", "datafusion-common", "datafusion-datasource", "datafusion-expr-common", @@ -2842,7 +2668,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" 
dependencies = [ - "arrow 58.3.0", + "arrow", "bigdecimal", "chrono", "crc32fast", @@ -2869,7 +2695,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow 58.3.0", + "arrow", "bigdecimal", "chrono", "datafusion-common", @@ -2888,7 +2714,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04e5a4a7a49143a68936992b6dbb0db44121c635e9992b2482817278f1e69c56" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "bigdecimal", "clap", @@ -3085,7 +2911,7 @@ version = "1.10502.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fdc796383b176dd5a45353fbb5e64583c0ee4da12cb62c9e510b785324b2488" dependencies = [ - "arrow 58.3.0", + "arrow", "cast", "comfy-table", "fallible-iterator", @@ -3440,7 +3266,7 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83cf860f6a6bf0a6a60fdfe5a36c75121fad5ea4332d1d12deee3e65b6047727" dependencies = [ - "arrow-array 58.3.0", + "arrow-array", "rand 0.9.4", ] @@ -4475,16 +4301,16 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d34e854994e84d043897f5ec9fb609221e9e69e3fd52996cd715d979fcd349f6" dependencies = [ - "arrow 58.3.0", - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-ipc 58.3.0", - "arrow-ord 58.3.0", - "arrow-row 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", "async-recursion", "async-trait", "async_cell", @@ -4543,14 +4369,14 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7827fe404358c27d120ee8ea8ef7b9415c2911d54072bec83dd689d750ae65da" dependencies = 
[ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-data 58.3.0", - "arrow-ipc 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-ord", + "arrow-schema", + "arrow-select", "bytes", "futures", "getrandom 0.2.17", @@ -4565,13 +4391,13 @@ name = "lance-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-cast 58.3.0", + "arrow-cast", "async-trait", "clap", "futures", "lance", "lance-encoding", - "parquet 58.3.0", + "parquet", "tempfile", "tokio", "tracing", @@ -4595,9 +4421,9 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b128c213c676cb8e03c62a68670642770825171e64097cc2da97cbb19fe35d29" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-schema", "async-trait", "byteorder", "bytes", @@ -4634,13 +4460,13 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e03b2de71cbcd09b10bf1a17c83cacbc0176ecd97203fb72b9e59d9b8f9a3743" dependencies = [ - "arrow 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", "async-trait", "chrono", "datafusion", @@ -4667,10 +4493,10 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fe7c7ea7fd397e495a1646fec360e46ee0cbd75718f1c0e887aad657c5f2944" dependencies = [ - "arrow 58.3.0", - "arrow-array 58.3.0", - "arrow-cast 58.3.0", - "arrow-schema 58.3.0", + "arrow", + "arrow-array", + "arrow-cast", + "arrow-schema", "chrono", "futures", "half", @@ -4687,13 +4513,13 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"fe3f8070835b407d8db9ea8728386bc3207ba23c66a9c22d344e231ef12b77ca" dependencies = [ - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", "bytemuck", "byteorder", "bytes", @@ -4726,12 +4552,12 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6dfcf654549330df3aef708cd7c12e170feecddd34d6c19dd005b4153213268" dependencies = [ - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "async-recursion", "async-trait", "byteorder", @@ -4760,12 +4586,12 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb8ad0bd10efa2608634a2518b7dd501231e76c56a65fbd6519e23914cc425a" dependencies = [ - "arrow 58.3.0", - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow", + "arrow-arith", + "arrow-array", + "arrow-ord", + "arrow-schema", + "arrow-select", "async-channel", "async-recursion", "async-trait", @@ -4826,14 +4652,14 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef5314703fa8c8baed04193cc669da80ab42521c6319d3cc921a4a997690dcc0" dependencies = [ - "arrow 58.3.0", - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", "async-recursion", "async-trait", "byteorder", @@ -4868,9 +4694,9 @@ version = "6.0.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51aa9b73279f505b2bec0f194c7a2390ca74ad3260131e631a7bef8d97d54b2e" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-schema", "cc", "deepsize", "half", @@ -4886,7 +4712,7 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cd01581f55ce45c49cbe494ee86c7ba7ca4ca3654690fd820941cd9105a46e" dependencies = [ - "arrow 58.3.0", + "arrow", "async-trait", "bytes", "lance-core", @@ -4915,11 +4741,11 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5db70650465a1af174b7dfe6948ec91a3d466ada12e11274eb66e51132173aa0" dependencies = [ - "arrow 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-ipc 58.3.0", - "arrow-schema 58.3.0", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", "async-trait", "byteorder", "bytes", @@ -5282,15 +5108,6 @@ dependencies = [ "libc", ] -[[package]] -name = "lz4_flex" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90071f8077f8e40adfc4b7fe9cd495ce316263f19e75c2211eeff3fdf475a3d9" -dependencies = [ - "twox-hash", -] - [[package]] name = "lz4_flex" version = "0.13.1" @@ -5910,15 +5727,8 @@ checksum = "cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" name = "onpair" version = "0.0.3" dependencies = [ - "arrow-array 57.3.1", - "arrow-schema 57.3.1", - "codspeed-divan-compat", "hashbrown 0.16.1", - "parquet 57.3.1", "rand 0.9.4", - "rstest", - "tpchgen 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "tpchgen-arrow 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -6101,40 +5911,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2e832c6aa20310fc6de7ea5a3f4e20d34fd83e3b43229d32b81ffe5c14d74692" -dependencies = [ - "ahash 0.8.12", - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-cast 57.3.1", - "arrow-data 57.3.1", - "arrow-ipc 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "half", - "hashbrown 0.16.1", - "lz4_flex 0.12.2", - "num-bigint", - "num-integer", - "num-traits", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "twox-hash", - "zstd", -] - [[package]] name = "parquet" version = "58.3.0" @@ -6142,12 +5918,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-data 58.3.0", - "arrow-ipc 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", "base64", "brotli", "bytes", @@ -6156,7 +5932,7 @@ dependencies = [ "futures", "half", "hashbrown 0.17.1", - "lz4_flex 0.13.1", + "lz4_flex", "num-bigint", "num-integer", "num-traits", @@ -6177,8 +5953,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74c8db065291f088a2aad8ab831853eae1871c0d311c8d0b83bbc3b7e735d0fc" dependencies = [ - "arrow 58.3.0", - "arrow-schema 58.3.0", + "arrow", + "arrow-schema", "chrono", "half", "indexmap 2.14.0", @@ -6193,8 +5969,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a530e8d5b5e14efcb39c9a6ec55432ad11f6afb7dc4455a79be0dc615fe3cc31" dependencies = [ - "arrow 58.3.0", - "arrow-schema 58.3.0", + "arrow", + "arrow-schema", "chrono", "half", "indexmap 2.14.0", @@ -6210,7 +5986,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"00ed89908289f67caa2ca078f9ff9aacd6229a313ec92b12bf4f48f613dc2b97" dependencies = [ - "arrow-schema 58.3.0", + "arrow-schema", "base64", "chrono", "parquet-variant", @@ -8939,34 +8715,18 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" -[[package]] -name = "tpchgen" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d651db770ccf53b89dd769ed47899c0c089452e3b725c3c48fbc6a2be579638" - [[package]] name = "tpchgen" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" -[[package]] -name = "tpchgen-arrow" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180f3759dffbf26d47021d2a84245a00f20945384bcf22e63c32652b04916e5a" -dependencies = [ - "arrow 57.3.1", - "tpchgen 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "tpchgen-arrow" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" dependencies = [ - "arrow 58.3.0", - "tpchgen 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", + "arrow", + "tpchgen", ] [[package]] @@ -9296,13 +9056,13 @@ name = "vector-search-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-schema", "clap", "futures", "indicatif", - "parquet 58.3.0", + "parquet", "rand 0.10.1", "serde", "tabled", @@ -9326,12 +9086,12 @@ name = "vortex" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", + "arrow-array", "codspeed-divan-compat", "fastlanes", "futures", "mimalloc", - "parquet 58.3.0", + "parquet", "paste", 
"rand 0.10.1", "rand_distr 0.6.0", @@ -9398,15 +9158,15 @@ dependencies = [ "arbitrary", "arc-swap", "arcref", - "arrow-arith 58.3.0", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-cast 58.3.0", - "arrow-data 58.3.0", - "arrow-ord 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", - "arrow-string 58.3.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "arrow-string", "async-lock", "bytes", "cfg-if", @@ -9470,9 +9230,9 @@ name = "vortex-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", - "arrow-schema 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-schema", + "arrow-select", "async-trait", "bytes", "bzip2", @@ -9488,7 +9248,7 @@ dependencies = [ "noodles-bgzf", "noodles-vcf", "parking_lot", - "parquet 58.3.0", + "parquet", "rand 0.10.1", "regex", "reqwest 0.13.4", @@ -9500,8 +9260,8 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tpchgen 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", - "tpchgen-arrow 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", + "tpchgen", + "tpchgen-arrow", "tracing", "tracing-perfetto", "tracing-subscriber", @@ -9516,9 +9276,9 @@ name = "vortex-bench-migrate" version = "0.1.0-alpha.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-schema", "clap", "duckdb", "flate2", @@ -9604,7 +9364,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-buffer 58.3.0", + "arrow-buffer", "bitvec", "bytes", "codspeed-divan-compat", @@ -9635,21 +9395,21 @@ dependencies = [ name = "vortex-compat" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-select 58.3.0", + "arrow-array", + "arrow-select", "base16ct", "bytes", "clap", "futures", - "parquet 
58.3.0", + "parquet", "reqwest 0.13.4", "serde", "serde_json", "sha2 0.11.0", "tempfile", "tokio", - "tpchgen 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", - "tpchgen-arrow 2.0.2 (git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f)", + "tpchgen", + "tpchgen-arrow", "vortex", "vortex-array", "vortex-buffer", @@ -9694,7 +9454,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-schema 58.3.0", + "arrow-schema", "async-trait", "bindgen", "bytes", @@ -9734,8 +9494,8 @@ name = "vortex-cxx" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "async-fs", "cxx", "futures", @@ -9749,7 +9509,7 @@ name = "vortex-datafusion" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema 58.3.0", + "arrow-schema", "async-trait", "datafusion", "datafusion-catalog", @@ -9845,7 +9605,7 @@ dependencies = [ name = "vortex-error" version = "0.1.0" dependencies = [ - "arrow-schema 58.3.0", + "arrow-schema", "flatbuffers", "jiff", "object_store 0.13.2", @@ -9880,8 +9640,8 @@ dependencies = [ name = "vortex-ffi" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "async-fs", "cbindgen", "futures", @@ -10049,8 +9809,8 @@ dependencies = [ name = "vortex-jni" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "async-fs", "futures", "jni", @@ -10068,8 +9828,8 @@ name = "vortex-layout" version = "0.1.0" dependencies = [ "arcref", - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "async-stream", "async-trait", "bit-vec", @@ -10110,7 +9870,7 @@ dependencies = [ name = "vortex-mask" version = "0.1.0" dependencies = [ - "arrow-buffer 58.3.0", + "arrow-buffer", "codspeed-divan-compat", "itertools 0.14.0", "rstest", @@ -10160,9 
+9920,9 @@ dependencies = [ name = "vortex-parquet-variant" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-buffer 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-buffer", + "arrow-schema", "chrono", "parquet-variant", "parquet-variant-compute", @@ -10206,9 +9966,9 @@ dependencies = [ name = "vortex-python" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-data 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-data", + "arrow-schema", "async-fs", "bytes", "itertools 0.14.0", @@ -10231,8 +9991,8 @@ name = "vortex-runend" version = "0.1.0" dependencies = [ "arbitrary", - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "codspeed-divan-compat", "itertools 0.14.0", "num-traits", @@ -10330,8 +10090,8 @@ dependencies = [ name = "vortex-tensor" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "codspeed-divan-compat", "half", "itertools 0.14.0", @@ -10355,8 +10115,8 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "futures", "vortex", "vortex-cuda", @@ -10367,8 +10127,8 @@ name = "vortex-tui" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-schema", "clap", "console_error_panic_hook", "crossterm 0.29.0", @@ -10381,7 +10141,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "js-sys", - "parquet 58.3.0", + "parquet", "ratatui", "ratzilla", "serde", @@ -10431,9 +10191,9 @@ dependencies = [ name = "vortex-web-wasm" version = "0.1.0" dependencies = [ - "arrow-array 58.3.0", - "arrow-ipc 58.3.0", - "arrow-schema 58.3.0", + "arrow-array", + "arrow-ipc", + "arrow-schema", "console_error_panic_hook", "futures", "js-sys", From 4b7af858760dda9f25507a0b1b02abd333a760c4 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 
21:49:02 +0100 Subject: [PATCH 20/27] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 5c8fc94c8dc..fbf10e8c9c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5726,6 +5726,7 @@ checksum = "cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" [[package]] name = "onpair" version = "0.0.3" +source = "git+https://github.com/spiraldb/onpair.git?rev=8d4009dc5a2c31251b9c2d657078fe8b09d3a02c#8d4009dc5a2c31251b9c2d657078fe8b09d3a02c" dependencies = [ "hashbrown 0.16.1", "rand 0.9.4", diff --git a/Cargo.toml b/Cargo.toml index a696d96d8ae..1aeccacc9ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -190,7 +190,7 @@ num_enum = { version = "0.7.3", default-features = false } object_store = { version = "0.13.1", default-features = false } once_cell = "1.21" oneshot = { version = "0.2.0", features = ["async"] } -onpair = { path = "../onpair" } +onpair = { version = "0.0.3", git = "https://github.com/spiraldb/onpair.git", rev = "8d4009dc5a2c31251b9c2d657078fe8b09d3a02c" } opentelemetry = "0.32.0" opentelemetry-otlp = "0.32.0" opentelemetry_sdk = "0.32.0" From e5aca428e9c0a031f80a711cb57855e9e40915d7 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 22:18:52 +0100 Subject: [PATCH 21/27] perf Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/array.rs | 2 +- encodings/experimental/onpair/src/compress.rs | 20 ++++++++++--------- encodings/experimental/onpair/src/lib.rs | 7 ------- encodings/experimental/onpair/src/ops.rs | 10 +++++++++- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs index 188fff272de..057a13ec33a 100644 --- a/encodings/experimental/onpair/src/array.rs +++ b/encodings/experimental/onpair/src/array.rs @@ -130,7 +130,7 @@ pub struct OnPairData { /// /// INVARIANT: this buffer must be over-padded past its logical end /// 
(`dict_offsets.last()`) by the decoder's fixed token read width, - /// [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE]. The over-copy decoder reads + /// [`MAX_TOKEN_SIZE`][onpair::MAX_TOKEN_SIZE]. The over-copy decoder reads /// every dictionary entry with one fixed-width load and then advances the /// cursor by the token's true length, so the load for the final, shortest /// token over-reads past the logical end of the dictionary. This is the diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs index 4c3de89f316..d75f440601e 100644 --- a/encodings/experimental/onpair/src/compress.rs +++ b/encodings/experimental/onpair/src/compress.rs @@ -66,7 +66,7 @@ where let column = onpair::compress(&flat, &offsets, config) .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = - parts_to_children(&column, &offsets)?; + parts_to_children(column, &offsets)?; let uncompressed_lengths = uncompressed_lengths.into_array(); let validity = match dtype.nullability() { @@ -93,16 +93,16 @@ where /// to [`onpair::compress`]; they let us recover the per-row *code* boundaries /// (the `codes_offsets` child), which the `onpair` crate no longer returns. fn parts_to_children( - column: &Column, + column: Column, row_byte_offsets: &[u64], ) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { let bits = column.bits; // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the // over-copy decoder can issue a fixed 16-byte load for every token // without risking an OOB read on the last entry. 
- let mut padded = Vec::with_capacity(column.dict_bytes.len() + crate::MAX_TOKEN_SIZE); + let mut padded = Vec::with_capacity(column.dict_bytes.len() + onpair::MAX_TOKEN_SIZE); padded.extend_from_slice(&column.dict_bytes); - padded.resize(column.dict_bytes.len() + crate::MAX_TOKEN_SIZE, 0); + padded.resize(column.dict_bytes.len() + onpair::MAX_TOKEN_SIZE, 0); // Align dict_bytes to 8 bytes so the segment that ultimately holds the // OnPair tree starts at an 8-aligned in-memory address. Without this // anchor, the per-buffer padding the serializer inserts is only @@ -112,9 +112,6 @@ fn parts_to_children( let dict_bytes = BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); - // The crate emits already-unpacked token codes (one `u16` per token), so - // they map straight onto the `codes` slot child. - let codes = Buffer::::copy_from(column.codes.as_slice()).into_array(); // Recover the per-row code boundaries. `onpair::compress` no longer returns // them, but its tokenizer never lets a token span a row boundary, so a // row's codes decode to exactly its byte span. Walk `codes`, summing each @@ -122,14 +119,19 @@ fn parts_to_children( // row when the accumulated decoded length reaches that row's byte offset. let codes_offsets = build_codes_offsets(&column.codes, &column.dict_offsets, row_byte_offsets)?; - let dict_offsets = Buffer::::copy_from(column.dict_offsets.as_slice()).into_array(); - let codes_offsets = Buffer::::copy_from(codes_offsets).into_array(); + // `column` owns its `codes`/`dict_offsets` vectors (and the crate emits + // already-unpacked `u16` token codes), so move them straight onto the slot + // children instead of copying. 
+ let codes = Buffer::from(column.codes).into_array(); + let dict_offsets = Buffer::from(column.dict_offsets).into_array(); + let codes_offsets = Buffer::from(codes_offsets).into_array(); Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) } /// Reconstruct the per-row `codes_offsets` from the flat `codes`, the /// dictionary `dict_offsets` (token byte lengths) and the per-row decoded byte /// boundaries. Returns `nrows + 1` cumulative code counts (`u32`). +// TODO(joe): can we compute this while compressing the array, yes but a worse API. fn build_codes_offsets( codes: &[u16], dict_offsets: &[u32], diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs index 719d5b0eab2..fa90ac9acb1 100644 --- a/encodings/experimental/onpair/src/lib.rs +++ b/encodings/experimental/onpair/src/lib.rs @@ -23,10 +23,3 @@ mod tests; pub use array::*; pub use compress::*; - -/// Fixed token-byte over-copy width. Matches the `onpair` crate's `MAX_TOKEN_SIZE`: -/// the decoder copies exactly this many bytes per token and advances the -/// output cursor by the *true* token length. Lets the compiler emit a single -/// 128-bit SIMD store per token on x86_64 / aarch64 instead of a -/// variable-length memcpy. 
-pub const MAX_TOKEN_SIZE: usize = 16; diff --git a/encodings/experimental/onpair/src/ops.rs b/encodings/experimental/onpair/src/ops.rs index 2272f6f5fca..a6e097bbfd4 100644 --- a/encodings/experimental/onpair/src/ops.rs +++ b/encodings/experimental/onpair/src/ops.rs @@ -9,6 +9,7 @@ use vortex_array::scalar::Scalar; use vortex_array::vtable::OperationsVTable; use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; +use vortex_error::vortex_err; use crate::OnPair; use crate::OnPairArraySlotsExt; @@ -39,7 +40,14 @@ impl OperationsVTable for OnPair { codes: codes.as_slice(), }; - let len = onpair::decompressed_len(parts); + // The per-row decoded length is recorded in the `uncompressed_lengths` + // child, so read it directly instead of asking the decoder to compute it. + let len = array + .uncompressed_lengths() + .execute_scalar(index, ctx)? + .as_primitive() + .as_::() + .ok_or_else(|| vortex_err!("OnPair uncompressed_lengths[{index}] is null"))?; let mut buf: Vec = Vec::with_capacity(len); let written = onpair::decompress_into(parts, buf.spare_capacity_mut()); debug_assert_eq!(written, len); From 080da5bc059ec7e41da4837755bb5cbea53ffafe Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 22:32:09 +0100 Subject: [PATCH 22/27] perf Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs index 057a13ec33a..1425420f37f 100644 --- a/encodings/experimental/onpair/src/array.rs +++ b/encodings/experimental/onpair/src/array.rs @@ -54,7 +54,7 @@ pub type OnPairArray = Array; /// On disk the layout is FSST-shape: /// /// * Buffer 0 — `dict_bytes`: the dictionary blob built by the OnPair trainer, -/// padded with [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] trailing zero +/// padded with [`MAX_TOKEN_SIZE`][onpair::MAX_TOKEN_SIZE] trailing zero /// bytes so the over-copy decoder can read 16 
bytes past the last token. /// * Slots — see [`OnPairSlots`]. /// From 99ed3f44d719f508171bfafcebebbc043c32fabf Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 22:36:04 +0100 Subject: [PATCH 23/27] perf Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/encodings/experimental/onpair/src/array.rs b/encodings/experimental/onpair/src/array.rs index 1425420f37f..18fa5e942ce 100644 --- a/encodings/experimental/onpair/src/array.rs +++ b/encodings/experimental/onpair/src/array.rs @@ -54,7 +54,7 @@ pub type OnPairArray = Array; /// On disk the layout is FSST-shape: /// /// * Buffer 0 — `dict_bytes`: the dictionary blob built by the OnPair trainer, -/// padded with [`MAX_TOKEN_SIZE`][onpair::MAX_TOKEN_SIZE] trailing zero +/// padded with `onpair::MAX_TOKEN_SIZE` trailing zero /// bytes so the over-copy decoder can read 16 bytes past the last token. /// * Slots — see [`OnPairSlots`]. /// @@ -130,7 +130,7 @@ pub struct OnPairData { /// /// INVARIANT: this buffer must be over-padded past its logical end /// (`dict_offsets.last()`) by the decoder's fixed token read width, - /// [`MAX_TOKEN_SIZE`][onpair::MAX_TOKEN_SIZE]. The over-copy decoder reads + /// `onpair::MAX_TOKEN_SIZE`. The over-copy decoder reads /// every dictionary entry with one fixed-width load and then advances the /// cursor by the token's true length, so the load for the final, shortest /// token over-reads past the logical end of the dictionary. 
This is the From b173497277a14627bd07e387d476cac2d6be4ce7 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 23:06:26 +0100 Subject: [PATCH 24/27] perf Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/compress.rs | 15 ++-- .../experimental/onpair/src/compute/filter.rs | 89 ++++++------------- 2 files changed, 31 insertions(+), 73 deletions(-) diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs index d75f440601e..b732fa08532 100644 --- a/encodings/experimental/onpair/src/compress.rs +++ b/encodings/experimental/onpair/src/compress.rs @@ -88,10 +88,6 @@ where /// Lift a compressed [`Column`] into Vortex children + the dict buffer. /// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. -/// -/// `row_byte_offsets` are the per-row decoded byte boundaries that were handed -/// to [`onpair::compress`]; they let us recover the per-row *code* boundaries -/// (the `codes_offsets` child), which the `onpair` crate no longer returns. fn parts_to_children( column: Column, row_byte_offsets: &[u64], @@ -112,11 +108,12 @@ fn parts_to_children( let dict_bytes = BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); - // Recover the per-row code boundaries. `onpair::compress` no longer returns - // them, but its tokenizer never lets a token span a row boundary, so a - // row's codes decode to exactly its byte span. Walk `codes`, summing each - // token's byte length (`dict_offsets[c+1] - dict_offsets[c]`), and close a - // row when the accumulated decoded length reaches that row's byte offset. + // Recover the per-row code boundaries. The resolved `onpair` crate does not + // return them from `compress`, but its tokenizer never lets a token span a + // row boundary, so a row's codes decode to exactly its byte span. 
Walk + // `codes`, summing each token's byte length (`dict_offsets[c+1] - + // dict_offsets[c]`), and close a row when the accumulated decoded length + // reaches that row's byte offset. let codes_offsets = build_codes_offsets(&column.codes, &column.dict_offsets, row_byte_offsets)?; // `column` owns its `codes`/`dict_offsets` vectors (and the crate emits diff --git a/encodings/experimental/onpair/src/compute/filter.rs b/encodings/experimental/onpair/src/compute/filter.rs index 736528f956a..142546d3b0e 100644 --- a/encodings/experimental/onpair/src/compute/filter.rs +++ b/encodings/experimental/onpair/src/compute/filter.rs @@ -10,19 +10,20 @@ //! //! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical** //! to the input; rebuild only `codes`, `codes_offsets`, -//! `uncompressed_lengths`, and validity by walking the mask. No decode, -//! no retrain on the read path. +//! `uncompressed_lengths`, and validity. No decode, no retrain on the +//! read path. use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::List; +use vortex_array::arrays::ListArray; use vortex_array::arrays::filter::FilterKernel; -use vortex_array::match_each_integer_ptype; -use vortex_buffer::BufferMut; +use vortex_array::arrays::list::ListArrayExt; +use vortex_array::validity::Validity; +use vortex_error::VortexExpect; use vortex_error::VortexResult; -use vortex_error::vortex_err; use vortex_mask::Mask; use crate::OnPair; @@ -36,62 +37,22 @@ impl FilterKernel for OnPair { mask: &Mask, ctx: &mut ExecutionCtx, ) -> VortexResult> { - let n_in = array.array().len(); - let n_out = mask.true_count(); - - // Materialise the per-row offset arrays we walk during filtering. - // The codes themselves we read through whatever ptype the - // cascading compressor narrowed to — match_each_integer_ptype - // dispatches on it below. 
- let codes_offsets_arr = array - .codes_offsets() - .clone() - .execute::<PrimitiveArray>(ctx)?; - let codes_arr = array.codes().clone().execute::<PrimitiveArray>(ctx)?; - - let mut new_codes_offsets = BufferMut::<u32>::with_capacity(n_out + 1); - - // The cascading compressor may have narrowed `codes_offsets` - // (e.g. u32 → u16 if every row's token count is small). Read - // through whatever ptype it lives at — the values still fit in - // `usize` when widened. Likewise for `codes`. - let new_codes: ArrayRef = match_each_integer_ptype!(codes_offsets_arr.ptype(), |OP| { - let codes_offsets = codes_offsets_arr.as_slice::<OP>(); - - // First pass: sum the surviving token count so we reserve once. - let mut new_codes_len: usize = 0; - for r in 0..n_in { - if mask.value(r) { - new_codes_len += (codes_offsets[r + 1] as usize) - (codes_offsets[r] as usize); - } - } - - // SAFETY: capacity reserved. - unsafe { new_codes_offsets.push_unchecked(0u32) }; - - match_each_integer_ptype!(codes_arr.ptype(), |P| { - let codes = codes_arr.as_slice::<P>(); - let mut out = BufferMut::<P>::with_capacity(new_codes_len); - let mut cursor: u32 = 0; - for r in 0..n_in { - if mask.value(r) { - let lo = codes_offsets[r] as usize; - let hi = codes_offsets[r + 1] as usize; - // SAFETY: codes_offsets validated at construction. - let segment = unsafe { codes.get_unchecked(lo..hi) }; - out.extend_from_slice(segment); - let segment_len = u32::try_from(hi - lo) - .map_err(|_| vortex_err!("token segment overflows u32"))?; - cursor = cursor - .checked_add(segment_len) - .ok_or_else(|| vortex_err!("codes_offsets overflow u32"))?; - // SAFETY: capacity reserved (n_out + 1 entries). - unsafe { new_codes_offsets.push_unchecked(cursor) }; - } - } - out.freeze().into_array() - }) - }); + // OnPair's `codes` + `codes_offsets` are a list of token runs, + // analogous to FSST's `codes` VarBin child. Delegate to the standard + // List filter so sparse masks can filter the encoded child directly. + let codes = unsafe { + ListArray::new_unchecked( + array.codes().clone(), + array.codes_offsets().clone(), + Validity::NonNullable, + ) + }; + let filtered_codes_ref = <List as FilterKernel>::filter(codes.as_view(), mask, ctx)? 
+ .vortex_expect("List filter kernel always returns Some"); + let filtered_codes = filtered_codes_ref + .try_downcast::() + .ok() + .vortex_expect("must be List"); // uncompressed_lengths + validity flow through the standard // primitive filter — these are short integer arrays so the cost @@ -105,8 +66,8 @@ impl FilterKernel for OnPair { array.dtype().clone(), array.dict_bytes_handle().clone(), array.dict_offsets().clone(), - new_codes, - new_codes_offsets.freeze().into_array(), + filtered_codes.elements().clone(), + filtered_codes.offsets().clone(), uncompressed_lengths, validity, array.bits(), From 8baa4295a648ae8207bd45b4ed3fe0d6445be23d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 23:25:30 +0100 Subject: [PATCH 25/27] fix Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/src/compute/filter.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/encodings/experimental/onpair/src/compute/filter.rs b/encodings/experimental/onpair/src/compute/filter.rs index 142546d3b0e..c26f3eeeacd 100644 --- a/encodings/experimental/onpair/src/compute/filter.rs +++ b/encodings/experimental/onpair/src/compute/filter.rs @@ -31,7 +31,6 @@ use crate::OnPairArrayExt; use crate::OnPairArraySlotsExt; impl FilterKernel for OnPair { - #[expect(clippy::cognitive_complexity, clippy::cast_possible_truncation)] fn filter( array: ArrayView<'_, Self>, mask: &Mask, From 95ee612f371b99ba05619887b0d5a478a5b52a28 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 29 May 2026 23:57:10 +0100 Subject: [PATCH 26/27] fix Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/public-api.lock | 2 - encodings/experimental/onpair/src/compress.rs | 74 +++++++++---------- encodings/experimental/onpair/src/kernel.rs | 2 +- 3 files changed, 37 insertions(+), 41 deletions(-) diff --git a/encodings/experimental/onpair/public-api.lock b/encodings/experimental/onpair/public-api.lock index 4278cf82cac..6d1618dceb9 100644 --- a/encodings/experimental/onpair/public-api.lock +++ 
b/encodings/experimental/onpair/public-api.lock @@ -194,8 +194,6 @@ pub fn vortex_onpair::OnPairSlotsView<'a>::to_owned(&self) -> vortex_onpair::OnP pub const vortex_onpair::DEFAULT_DICT12_CONFIG: onpair::config::Config -pub const vortex_onpair::MAX_TOKEN_SIZE: usize - pub trait vortex_onpair::OnPairArrayExt: vortex_onpair::OnPairArraySlotsExt pub fn vortex_onpair::OnPairArrayExt::array_validity(&self) -> vortex_array::validity::Validity diff --git a/encodings/experimental/onpair/src/compress.rs b/encodings/experimental/onpair/src/compress.rs index b732fa08532..226922cf593 100644 --- a/encodings/experimental/onpair/src/compress.rs +++ b/encodings/experimental/onpair/src/compress.rs @@ -3,8 +3,8 @@ //! Train + compress entry points for the OnPair encoding. -use onpair::Column; use onpair::Config; +use onpair::Offset; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; @@ -38,25 +38,38 @@ pub fn onpair_compress_iter<'a, I>( ) -> VortexResult where I: Iterator>, +{ + onpair_compress_iter_with_offsets::(iter, len, dtype, config) +} + +fn onpair_compress_iter_with_offsets<'a, O, I>( + iter: I, + len: usize, + dtype: DType, + config: Config, +) -> VortexResult +where + O: Offset, + I: Iterator>, { let mut flat: Vec = Vec::with_capacity(len * 16); - let mut offsets: Vec = Vec::with_capacity(len + 1); + let mut offsets: Vec = Vec::with_capacity(len + 1); let mut uncompressed_lengths: BufferMut = BufferMut::with_capacity(len); let mut validity_bits: Vec = Vec::with_capacity(len); - offsets.push(0); + offsets.push(::from_usize(0)); for item in iter { match item { Some(bytes) => { flat.extend_from_slice(bytes); - offsets.push(flat.len() as u64); + offsets.push(::from_usize(flat.len())); uncompressed_lengths.push( i32::try_from(bytes.len()).vortex_expect("string length must fit in i32"), ); validity_bits.push(true); } None => { - offsets.push(flat.len() as u64); + offsets.push(::from_usize(flat.len())); uncompressed_lengths.push(0); 
validity_bits.push(false); } @@ -65,8 +78,12 @@ where let column = onpair::compress(&flat, &offsets, config) .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; - let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = - parts_to_children(column, &offsets)?; + let bits = column.bits; + let dict_bytes = dict_bytes_to_buffer(column.dict_bytes); + let codes_offsets = build_codes_offsets(&column.codes, &column.dict_offsets, &offsets)?; + let codes = Buffer::from(column.codes).into_array(); + let dict_offsets = Buffer::from(column.dict_offsets).into_array(); + let codes_offsets = Buffer::from(codes_offsets).into_array(); let uncompressed_lengths = uncompressed_lengths.into_array(); let validity = match dtype.nullability() { @@ -86,53 +103,31 @@ where ) } -/// Lift a compressed [`Column`] into Vortex children + the dict buffer. -/// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. -fn parts_to_children( - column: Column, - row_byte_offsets: &[u64], -) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { - let bits = column.bits; +/// Lift compressed dictionary bytes into the Vortex buffer slot. +fn dict_bytes_to_buffer(dict_bytes: Vec) -> BufferHandle { // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the // over-copy decoder can issue a fixed 16-byte load for every token // without risking an OOB read on the last entry. - let mut padded = Vec::with_capacity(column.dict_bytes.len() + onpair::MAX_TOKEN_SIZE); - padded.extend_from_slice(&column.dict_bytes); - padded.resize(column.dict_bytes.len() + onpair::MAX_TOKEN_SIZE, 0); + let mut padded = Vec::with_capacity(dict_bytes.len() + onpair::MAX_TOKEN_SIZE); + padded.extend_from_slice(&dict_bytes); + padded.resize(dict_bytes.len() + onpair::MAX_TOKEN_SIZE, 0); // Align dict_bytes to 8 bytes so the segment that ultimately holds the // OnPair tree starts at an 8-aligned in-memory address. 
Without this // anchor, the per-buffer padding the serializer inserts is only // *relative* to the segment start; if the segment lands at a u8-aligned // heap address, downstream `PrimitiveArray::deserialize` panics // with `Misaligned buffer cannot be used to build PrimitiveArray of u32`. - let dict_bytes = - BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); - - // Recover the per-row code boundaries. The resolved `onpair` crate does not - // return them from `compress`, but its tokenizer never lets a token span a - // row boundary, so a row's codes decode to exactly its byte span. Walk - // `codes`, summing each token's byte length (`dict_offsets[c+1] - - // dict_offsets[c]`), and close a row when the accumulated decoded length - // reaches that row's byte offset. - let codes_offsets = build_codes_offsets(&column.codes, &column.dict_offsets, row_byte_offsets)?; - - // `column` owns its `codes`/`dict_offsets` vectors (and the crate emits - // already-unpacked `u16` token codes), so move them straight onto the slot - // children instead of copying. - let codes = Buffer::from(column.codes).into_array(); - let dict_offsets = Buffer::from(column.dict_offsets).into_array(); - let codes_offsets = Buffer::from(codes_offsets).into_array(); - Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) + BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))) } /// Reconstruct the per-row `codes_offsets` from the flat `codes`, the /// dictionary `dict_offsets` (token byte lengths) and the per-row decoded byte /// boundaries. Returns `nrows + 1` cumulative code counts (`u32`). // TODO(joe): can we compute this while compressing the array, yes but a worse API. 
-fn build_codes_offsets( +fn build_codes_offsets( codes: &[u16], dict_offsets: &[u32], - row_byte_offsets: &[u64], + row_byte_offsets: &[O], ) -> VortexResult> { let nrows = row_byte_offsets.len() - 1; let mut codes_offsets = Vec::with_capacity(nrows + 1); @@ -140,7 +135,10 @@ fn build_codes_offsets( let mut decoded_bytes: u64 = 0; let mut code_idx: usize = 0; for r in 0..nrows { - let target = row_byte_offsets[r + 1]; + let target = row_byte_offsets[r + 1] + .to_usize() + .ok_or_else(|| vortex_err!("OnPair row byte offset does not fit usize"))? + as u64; while decoded_bytes < target { let code = codes[code_idx] as usize; decoded_bytes += u64::from(dict_offsets[code + 1] - dict_offsets[code]); diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs index 99fb5bf2d14..fdd521e887e 100644 --- a/encodings/experimental/onpair/src/kernel.rs +++ b/encodings/experimental/onpair/src/kernel.rs @@ -6,6 +6,6 @@ use vortex_array::kernel::ParentKernelSet; use crate::OnPair; -// TODO: implement TakeExecute for OnPair +// TODO: implement ListExecute & TakeExecute for OnPair pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair))]); From da168c89efbe54b5ac60ed9586b4eee268e435ba Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Sat, 30 May 2026 00:13:55 +0100 Subject: [PATCH 27/27] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 2 +- Cargo.toml | 2 +- encodings/experimental/onpair/public-api.lock | 8 ++++++++ encodings/experimental/onpair/src/lib.rs | 4 ++++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fbf10e8c9c8..374306195f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5726,7 +5726,7 @@ checksum = "cfe21416a02c693fb9f980befcb230ecc70b0b3d1cc4abf88b9675c4c1457f0c" [[package]] name = "onpair" version = "0.0.3" -source = 
"git+https://github.com/spiraldb/onpair.git?rev=8d4009dc5a2c31251b9c2d657078fe8b09d3a02c#8d4009dc5a2c31251b9c2d657078fe8b09d3a02c" +source = "git+https://github.com/spiraldb/onpair.git?rev=53e8ca6081d377e9933d999cef286e26bf52e2c7#53e8ca6081d377e9933d999cef286e26bf52e2c7" dependencies = [ "hashbrown 0.16.1", "rand 0.9.4", diff --git a/Cargo.toml b/Cargo.toml index 1aeccacc9ba..fe69b2370e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -190,7 +190,7 @@ num_enum = { version = "0.7.3", default-features = false } object_store = { version = "0.13.1", default-features = false } once_cell = "1.21" oneshot = { version = "0.2.0", features = ["async"] } -onpair = { version = "0.0.3", git = "https://github.com/spiraldb/onpair.git", rev = "8d4009dc5a2c31251b9c2d657078fe8b09d3a02c" } +onpair = { version = "0.0.3", git = "https://github.com/spiraldb/onpair.git", rev = "53e8ca6081d377e9933d999cef286e26bf52e2c7" } opentelemetry = "0.32.0" opentelemetry-otlp = "0.32.0" opentelemetry_sdk = "0.32.0" diff --git a/encodings/experimental/onpair/public-api.lock b/encodings/experimental/onpair/public-api.lock index 6d1618dceb9..1571f6b5d67 100644 --- a/encodings/experimental/onpair/public-api.lock +++ b/encodings/experimental/onpair/public-api.lock @@ -1,5 +1,13 @@ pub mod vortex_onpair +pub use vortex_onpair::Bits + +pub use vortex_onpair::Config + +pub use vortex_onpair::OnPairError + +pub use vortex_onpair::Threshold + pub struct vortex_onpair::OnPair impl vortex_onpair::OnPair diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs index fa90ac9acb1..94c18b6dec8 100644 --- a/encodings/experimental/onpair/src/lib.rs +++ b/encodings/experimental/onpair/src/lib.rs @@ -23,3 +23,7 @@ mod tests; pub use array::*; pub use compress::*; +pub use onpair::Bits; +pub use onpair::Config; +pub use onpair::Error as OnPairError; +pub use onpair::Threshold;