From f8b218986cf75a171b7bd51717ef01e106bcd4eb Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:30:26 +0000 Subject: [PATCH 01/18] vortex-tensor: promote vector-search helpers to public API Promote `compress_turboquant`, `build_constant_query_vector`, and `build_similarity_search_tree` from the bench-only `similarity_search_common` module into a new public `vortex_tensor::vector_search` module so downstream benchmark crates (and library users doing brute-force vector search) can reuse them without duplicating the canonical recipe. The existing similarity_search bench now delegates to these helpers instead of inlining them, and adds three unit tests for the new public surface: constant-query vector construction, end-to-end tree execution to a BoolArray, and a TurboQuant-roundtrip ranking check. Signed-off-by: Claude Signed-off-by: Connor Tsui --- .../benches/similarity_search_common/mod.rs | 93 +----- vortex-tensor/public-api.lock | 8 + vortex-tensor/src/lib.rs | 2 + vortex-tensor/src/vector_search.rs | 301 ++++++++++++++++++ 4 files changed, 321 insertions(+), 83 deletions(-) create mode 100644 vortex-tensor/src/vector_search.rs diff --git a/vortex-tensor/benches/similarity_search_common/mod.rs b/vortex-tensor/benches/similarity_search_common/mod.rs index c22cb5a9f08..ee0bab128ed 100644 --- a/vortex-tensor/benches/similarity_search_common/mod.rs +++ b/vortex-tensor/benches/similarity_search_common/mod.rs @@ -30,22 +30,14 @@ use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; -use vortex_array::arrays::ConstantArray; use vortex_array::arrays::Extension; use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::extension::ExtensionArrayExt; use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; -use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; -use 
vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::dtype::PType; use vortex_array::dtype::extension::ExtDType; use vortex_array::extension::EmptyMetadata; -use vortex_array::scalar::Scalar; -use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_btrblocks::BtrBlocksCompressor; @@ -54,12 +46,9 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_panic; use vortex_session::VortexSession; -use vortex_tensor::encodings::turboquant::TurboQuantConfig; -use vortex_tensor::encodings::turboquant::turboquant_encode_unchecked; -use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; -use vortex_tensor::scalar_fns::l2_denorm::L2Denorm; -use vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm; use vortex_tensor::vector::Vector; +use vortex_tensor::vector_search::build_similarity_search_tree as public_build_similarity_search_tree; +use vortex_tensor::vector_search::compress_turboquant as public_compress_turboquant; /// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and /// example can create execution contexts cheaply. @@ -146,25 +135,6 @@ pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec` extension array whose storage is a [`ConstantArray`] broadcasting a -/// single query vector across `num_rows` rows. This is how we hand a single query vector to -/// `CosineSimilarity` on the `rhs` side -- `ScalarFnArray` requires both children to have the -/// same length, so we broadcast the query instead of hand-rolling a 1-row input. 
-fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult { - let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable); - - let children: Vec = query - .iter() - .map(|&v| Scalar::primitive(v, Nullability::NonNullable)) - .collect(); - let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable); - - let storage = ConstantArray::new(storage_scalar, num_rows).into_array(); - - let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage.dtype().clone())?.erased(); - Ok(ExtensionArray::new(ext_dtype, storage).into_array()) -} - /// Compresses a raw `Vector` array with the default BtrBlocks pipeline. /// /// [`BtrBlocksCompressor`] walks into the extension array and recursively compresses the @@ -175,36 +145,11 @@ pub fn compress_default(data: ArrayRef) -> VortexResult { BtrBlocksCompressor::default().compress(&data) } -/// Compresses a raw `Vector` array with the TurboQuant pipeline by hand, producing the -/// same tree shape that -/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] would: -/// -/// ```text -/// L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms) -/// ``` -/// -/// Calling the encode helpers directly (instead of going through -/// `BtrBlocksCompressorBuilder::with_turboquant()`) lets this example avoid depending on the -/// `unstable_encodings` feature flag. -/// -/// See `vortex-tensor/src/encodings/turboquant/tests/mod.rs::normalize_and_encode` for the same -/// canonical recipe. +/// Compresses a raw `Vector` array with the TurboQuant pipeline. This is a thin +/// wrapper around [`vortex_tensor::vector_search::compress_turboquant`] preserved for bench +/// call-site compatibility. 
pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let l2_denorm = normalize_as_l2_denorm(data, ctx)?; - let normalized = l2_denorm.child_at(0).clone(); - let norms = l2_denorm.child_at(1).clone(); - let num_rows = l2_denorm.len(); - - let normalized_ext = normalized - .as_opt::() - .vortex_expect("normalized child should be an Extension array"); - - let config = TurboQuantConfig::default(); - // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is the - // invariant `turboquant_encode_unchecked` expects. - let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; - - Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) + public_compress_turboquant(data, ctx) } /// Dispatch helper that builds the data array for the requested [`Variant`], starting from a @@ -226,31 +171,13 @@ pub fn build_variant( } /// Build the lazy similarity-search array tree for a prepared data array and a single query -/// vector. The returned tree is a boolean array of length `data.len()` where position `i` is -/// `true` iff `cosine_similarity(data[i], query) > threshold`. -/// -/// The tree shape is: -/// -/// ```text -/// Binary(Gt, [ -/// CosineSimilarity([data, ConstantArray(query_vec, n)]), -/// ConstantArray(threshold, n), -/// ]) -/// ``` -/// -/// This function does no execution; it is safe to call inside a benchmark setup closure. +/// vector. Thin wrapper around +/// [`vortex_tensor::vector_search::build_similarity_search_tree`] preserved for bench +/// call-site compatibility. 
pub fn build_similarity_search_tree( data: ArrayRef, query: &[f32], threshold: f32, ) -> VortexResult { - let num_rows = data.len(); - let query_vec = build_constant_query_vector(query, num_rows)?; - - let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); - - let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); - let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); - - cosine.binary(threshold_array, Operator::Gt) + public_build_similarity_search_tree(data, query, threshold) } diff --git a/vortex-tensor/public-api.lock b/vortex-tensor/public-api.lock index bec8df1cb29..90aea23a194 100644 --- a/vortex-tensor/public-api.lock +++ b/vortex-tensor/public-api.lock @@ -550,4 +550,12 @@ impl core::marker::Copy for vortex_tensor::vector::VectorMatcherMetadata impl core::marker::StructuralPartialEq for vortex_tensor::vector::VectorMatcherMetadata +pub mod vortex_tensor::vector_search + +pub fn vortex_tensor::vector_search::build_constant_query_vector(query: &[f32], num_rows: usize) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::build_similarity_search_tree(data: vortex_array::array::erased::ArrayRef, query: &[f32], threshold: f32) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::compress_turboquant(data: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_tensor::initialize(session: &vortex_session::VortexSession) diff --git a/vortex-tensor/src/lib.rs b/vortex-tensor/src/lib.rs index 3d3563aa8e4..b3cf6c21695 100644 --- a/vortex-tensor/src/lib.rs +++ b/vortex-tensor/src/lib.rs @@ -25,6 +25,8 @@ pub mod vector; pub mod encodings; +pub mod vector_search; + mod utils; /// Initialize the Vortex tensor library with a Vortex session. 
diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs new file mode 100644 index 00000000000..37adbf27a61 --- /dev/null +++ b/vortex-tensor/src/vector_search.rs @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Reusable helpers for building brute-force vector similarity search expressions over +//! [`Vector`] extension arrays. +//! +//! This module exposes three small building blocks that together make it straightforward to +//! stand up a cosine-similarity-plus-threshold scan on top of a prepared data array: +//! +//! - [`compress_turboquant`] applies the canonical TurboQuant encoding pipeline +//! (`L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)`) to a raw +//! `Vector` array without requiring the caller to plumb the +//! `unstable_encodings` feature flag on the `vortex` facade. +//! - [`build_constant_query_vector`] wraps a single query vector into a +//! [`Vector`] extension array whose storage is a [`ConstantArray`] broadcast +//! across `num_rows` rows. This is the shape expected by +//! [`CosineSimilarity::try_new_array`] for the RHS of a database-vs-query scan. +//! - [`build_similarity_search_tree`] wires everything together into a lazy +//! `Binary(Gt, [CosineSimilarity(data, query), threshold])` expression. +//! +//! Executing the tree from [`build_similarity_search_tree`] into a +//! [`BoolArray`](vortex_array::arrays::BoolArray) yields one boolean per row indicating whether +//! that row's cosine similarity to the query exceeds `threshold`. +//! +//! # Example +//! +//! ```ignore +//! use vortex_array::{ArrayRef, VortexSessionExecute}; +//! use vortex_array::arrays::BoolArray; +//! use vortex_session::VortexSession; +//! use vortex_tensor::vector_search::{build_similarity_search_tree, compress_turboquant}; +//! +//! fn run(session: &VortexSession, data: ArrayRef, query: &[f32]) -> anyhow::Result<()> { +//! 
let mut ctx = session.create_execution_ctx(); +//! let data = compress_turboquant(data, &mut ctx)?; +//! let tree = build_similarity_search_tree(data, query, 0.8)?; +//! let _matches: BoolArray = tree.execute(&mut ctx)?; +//! Ok(()) +//! } +//! ``` +//! +//! [`Vector`]: crate::vector::Vector +//! [`CosineSimilarity::try_new_array`]: crate::scalar_fns::cosine_similarity::CosineSimilarity::try_new_array + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::Extension; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::extension::EmptyMetadata; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::encodings::turboquant::TurboQuantConfig; +use crate::encodings::turboquant::turboquant_encode_unchecked; +use crate::scalar_fns::cosine_similarity::CosineSimilarity; +use crate::scalar_fns::l2_denorm::L2Denorm; +use crate::scalar_fns::l2_denorm::normalize_as_l2_denorm; +use crate::vector::Vector; + +/// Apply the canonical TurboQuant encoding pipeline to a `Vector` array. +/// +/// The returned array has the shape +/// `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` — exactly what +/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] produces when invoked through +/// `BtrBlocksCompressorBuilder::with_turboquant()`, but without requiring callers to enable +/// the `unstable_encodings` feature on the `vortex` facade. 
+/// +/// The input `data` must be a [`Vector`] extension array whose element type is `f32` and whose +/// dimensionality is at least +/// [`turboquant::MIN_DIMENSION`](crate::encodings::turboquant::MIN_DIMENSION). The TurboQuant +/// configuration used is [`TurboQuantConfig::default()`] (8-bit codes, 3 SORF rounds, seed 42). +/// +/// # Errors +/// +/// Returns an error if `data` is not a [`Vector`] extension array, if normalization fails, or +/// if the underlying TurboQuant encoder rejects the input shape. +pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let l2_denorm = normalize_as_l2_denorm(data, ctx)?; + let normalized = l2_denorm.child_at(0).clone(); + let norms = l2_denorm.child_at(1).clone(); + let num_rows = l2_denorm.len(); + + let Some(normalized_ext) = normalized.as_opt::() else { + vortex_bail!("normalize_as_l2_denorm must produce an Extension array child"); + }; + + let config = TurboQuantConfig::default(); + // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is + // the invariant `turboquant_encode_unchecked` expects. + let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; + + Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) +} + +/// Build a `Vector` extension array whose storage is a [`ConstantArray`] broadcasting +/// a single query vector across `num_rows` rows. +/// +/// This is the shape expected for the RHS of a database-vs-query +/// [`CosineSimilarity`](crate::scalar_fns::cosine_similarity::CosineSimilarity) scan: the +/// `ScalarFnArray` contract requires both children to have the same length, so rather than +/// hand-rolling a 1-row input we broadcast the query across the whole database. +/// +/// # Errors +/// +/// Returns an error if the [`Vector`] extension dtype rejects the constructed storage dtype. 
+pub fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult<ArrayRef> {
+    let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable);
+
+    let children: Vec<Scalar> = query
+        .iter()
+        .map(|&v| Scalar::primitive(v, Nullability::NonNullable))
+        .collect();
+    let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable);
+
+    let storage = ConstantArray::new(storage_scalar, num_rows).into_array();
+
+    let ext_dtype = ExtDType::<Vector>::try_new(EmptyMetadata, storage.dtype().clone())?.erased();
+    Ok(ExtensionArray::new(ext_dtype, storage).into_array())
+}
+
+/// Build the lazy similarity-search expression tree for a prepared database array and a
+/// single query vector.
+///
+/// The returned array is a lazy boolean expression of length `data.len()` whose position `i`
+/// is `true` iff `cosine_similarity(data[i], query) > threshold`. Executing it into a
+/// [`BoolArray`](vortex_array::arrays::BoolArray) runs the full scan.
+///
+/// The tree shape is:
+///
+/// ```text
+/// Binary(Gt, [
+///     CosineSimilarity([data, ConstantArray(query_vec, n)]),
+///     ConstantArray(threshold, n),
+/// ])
+/// ```
+///
+/// This function performs no execution; it is safe to call inside a benchmark setup closure.
+///
+/// # Errors
+///
+/// Returns an error if building the broadcast query vector or the threshold comparison fails.
+/// An incompatible `query` length panics via `vortex_expect` rather than returning an error.
+pub fn build_similarity_search_tree( + data: ArrayRef, + query: &[f32], + threshold: f32, +) -> VortexResult { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + + let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows) + .vortex_expect("cosine similarity accepts two matching Vector extension arrays") + .into_array(); + + let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); + let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); + + cosine.binary(threshold_array, Operator::Gt) +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::Extension; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::FixedSizeListArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::bool::BoolArrayExt; + use vortex_array::dtype::extension::ExtDType; + use vortex_array::extension::EmptyMetadata; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_buffer::BufferMut; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::build_constant_query_vector; + use super::build_similarity_search_tree; + use super::compress_turboquant; + use crate::vector::Vector; + + /// Build a `Vector` extension array from a flat f32 slice. Each contiguous + /// group of `DIM` values becomes one row. 
+ fn vector_array(dim: u32, values: &[f32]) -> VortexResult { + let dim_usize = dim as usize; + assert_eq!(values.len() % dim_usize, 0); + let num_rows = values.len() / dim_usize; + + let mut buf = BufferMut::::with_capacity(values.len()); + for &v in values { + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable); + let fsl = FixedSizeListArray::try_new( + elements.into_array(), + dim, + Validity::NonNullable, + num_rows, + )?; + + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) + } + + fn test_session() -> VortexSession { + VortexSession::empty().with::() + } + + #[test] + fn constant_query_vector_has_vector_extension_dtype() -> VortexResult<()> { + let query = vec![1.0f32, 0.0, 0.0, 0.0]; + let rhs = build_constant_query_vector(&query, 5)?; + + assert_eq!(rhs.len(), 5); + assert!(rhs.as_opt::().is_some()); + Ok(()) + } + + #[test] + fn similarity_search_tree_executes_to_bool_array() -> VortexResult<()> { + // 4 rows of 3-dim vectors; the first and last match the query [1, 0, 0]. + let data = vector_array( + 3, + &[ + 1.0, 0.0, 0.0, // + 0.0, 1.0, 0.0, // + 0.0, 0.0, 1.0, // + 1.0, 0.0, 0.0, // + ], + )?; + let query = [1.0f32, 0.0, 0.0]; + + let tree = build_similarity_search_tree(data, &query, 0.5)?; + let mut ctx = test_session().create_execution_ctx(); + let result: BoolArray = tree.execute(&mut ctx)?; + + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), 4); + assert!(bits.value(0)); + assert!(!bits.value(1)); + assert!(!bits.value(2)); + assert!(bits.value(3)); + Ok(()) + } + + #[test] + fn turboquant_roundtrip_preserves_ranking() -> VortexResult<()> { + // Build 6 rows of 128-dim vectors where row 0 is highly correlated with the query. + // TurboQuant should preserve the "row 0 is the best match" ordering. 
+ const DIM: u32 = 128; + const NUM_ROWS: usize = 6; + + let mut values = Vec::::with_capacity(NUM_ROWS * DIM as usize); + let query: Vec = (0..DIM as usize) + .map(|i| ((i as f32) * 0.017).sin()) + .collect(); + + // Row 0: identical to query (cosine=1.0) + values.extend_from_slice(&query); + // Row 1: query + noise + for (i, q) in query.iter().enumerate() { + values.push(q + 0.05 * ((i as f32) * 0.03).cos()); + } + // Rows 2..6: unrelated patterns + for row in 2..NUM_ROWS { + for i in 0..DIM as usize { + values.push(((row as f32 * 1.3 + i as f32) * 0.07).sin()); + } + } + + let data = vector_array(DIM, &values)?; + let mut ctx = test_session().create_execution_ctx(); + let compressed = compress_turboquant(data, &mut ctx)?; + assert_eq!(compressed.len(), NUM_ROWS); + + // Build a tree with a low threshold so row 0 (cosine=1.0 exact) matches. + let tree = build_similarity_search_tree(compressed, &query, 0.95)?; + let result: BoolArray = tree.execute(&mut ctx)?; + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), NUM_ROWS); + assert!( + bits.value(0), + "row 0 (identical to query) must match at threshold 0.95 even after TurboQuant" + ); + Ok(()) + } +} From b30bba29d6543b20f1e8d028fd0164961c0cbcbe Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:40:28 +0000 Subject: [PATCH 02/18] vortex-bench: add VortexTurboQuant format and list_to_vector_ext utility Adds `Format::VortexTurboQuant` as a new Format enum variant, parallel to `VortexCompact`, so downstream dashboards can distinguish TurboQuant-encoded Vortex results from the generic BtrBlocks-compressed OnDiskVortex layout via the existing `Target { engine, format }` grouping key. Also adds `vortex_bench::conversions::list_to_vector_ext`: a utility that rewraps a `List` column (the shape an `emb` column takes when ingested from parquet via `parquet_to_vortex_chunks`) as `Extension(FixedSizeList)`. 
This is the only piece of conversion glue needed to bring a VectorDBBench-style parquet embedding column into a form the vector-search scalar functions expect. It handles both single-list and chunked inputs, validates uniform stride, and rejects nullable, non-float, or mismatched-length inputs. `vortex-bench` gains a direct dependency on `vortex-tensor` for the `Vector` extension type. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 1 + vortex-bench/Cargo.toml | 1 + vortex-bench/src/conversions.rs | 238 +++++++++++++++++++++++++++++++ vortex-bench/src/lib.rs | 9 ++ vortex-bench/src/measurements.rs | 9 +- 5 files changed, 256 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ce1bc853cc..424eebac8f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10241,6 +10241,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-tensor", ] [[package]] diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 62d302f12e1..1859a64a16e 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -63,3 +63,4 @@ vortex = { workspace = true, features = [ "tokio", "zstd", ] } +vortex-tensor = { workspace = true } diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 3f21ab30ba0..13e295e13c4 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -21,17 +21,28 @@ use vortex::VortexSessionDefault; use vortex::array::ArrayRef; use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; +use vortex::array::arrays::Chunked; use vortex::array::arrays::ChunkedArray; +use vortex::array::arrays::ExtensionArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::List; +use vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::array::arrays::list::ListArrayExt; use vortex::array::arrow::FromArrowArray; use vortex::array::builders::builder_with_capacity; +use vortex::array::extension::EmptyMetadata; use 
vortex::array::stream::ArrayStreamAdapter; use vortex::array::stream::ArrayStreamExt; +use vortex::array::validity::Validity; use vortex::dtype::DType; use vortex::dtype::arrow::FromArrowType; +use vortex::dtype::extension::ExtDType; use vortex::error::VortexResult; +use vortex::error::vortex_bail; use vortex::error::vortex_err; use vortex::file::WriteOptionsSessionExt; use vortex::session::VortexSession; +use vortex_tensor::vector::Vector; use crate::CompactionStrategy; use crate::Format; @@ -222,3 +233,230 @@ pub async fn write_parquet_as_vortex( }) .await } + +/// Rewrap a list-of-float column as a [`vortex_tensor::vector::Vector`] extension array. +/// +/// Parquet has no fixed-size list logical type, so an embedding column ingested via +/// [`parquet_to_vortex_chunks`] arrives as `List` (or `List` / `List`) even +/// when every row has the same length. This helper validates that every list in `input` +/// has the same length `D` and reconstructs the column as +/// `Extension(FixedSizeList)` — the shape expected by the vector-search +/// scalar functions in `vortex-tensor`. +/// +/// The input may be either a single [`List`] array or a [`Chunked`] array of lists (the +/// common case after `parquet_to_vortex_chunks`). Chunked inputs are converted chunk-by-chunk +/// and reassembled as a [`ChunkedArray`] of `Extension`. +/// +/// # Errors +/// +/// Returns an error if: +/// - `input` is not a `List` or `Chunked` array. +/// - The element type is not a non-nullable float primitive (`f16`, `f32`, or `f64`). +/// - Any row has a different length than the first row. +/// - The list validity is nullable (vector elements cannot be null at the row level). +/// - The input has zero rows (the dimension cannot be inferred from empty input). 
+pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { + if let Some(chunked) = input.as_opt::() { + let converted: Vec = chunked + .iter_chunks() + .map(|chunk| list_to_vector_ext(chunk.clone())) + .collect::>()?; + if converted.is_empty() { + vortex_bail!("list_to_vector_ext: chunked input has no chunks"); + } + return Ok(ChunkedArray::from_iter(converted).into_array()); + } + + let Some(list) = input.as_opt::() else { + vortex_bail!( + "list_to_vector_ext expects a List array, got dtype {}", + input.dtype() + ); + }; + + if !matches!( + list.list_validity(), + Validity::NonNullable | Validity::AllValid + ) { + vortex_bail!( + "list_to_vector_ext: list rows must be non-nullable for Vector extension wrapping" + ); + } + + let element_dtype = list.element_dtype().clone(); + let DType::Primitive(ptype, elem_nullability) = &element_dtype else { + vortex_bail!( + "list_to_vector_ext: element dtype must be a primitive float, got {}", + element_dtype + ); + }; + if !ptype.is_float() { + vortex_bail!( + "list_to_vector_ext: element type must be float (f16/f32/f64), got {}", + ptype + ); + } + if elem_nullability.is_nullable() { + vortex_bail!( + "list_to_vector_ext: element type must be non-nullable, got nullable {}", + ptype + ); + } + + let num_rows = input.len(); + if num_rows == 0 { + vortex_bail!("list_to_vector_ext: cannot infer vector dimension from empty input"); + } + + let first_start = list.offset_at(0)?; + let first_end = list.offset_at(1)?; + let dim = first_end.checked_sub(first_start).ok_or_else(|| { + vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") + })?; + if dim == 0 { + vortex_bail!("list_to_vector_ext: first row has zero elements"); + } + + for i in 1..num_rows { + let start = list.offset_at(i)?; + let end = list.offset_at(i + 1)?; + let row_len = end.checked_sub(start).ok_or_else(|| { + vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") + })?; + if row_len != dim { + vortex_bail!( + 
"list_to_vector_ext: row {} has length {} but expected {}", + i, + row_len, + dim + ); + } + } + + let elements = list.sliced_elements()?; + let expected_elements = num_rows + .checked_mul(dim) + .ok_or_else(|| vortex_err!("list_to_vector_ext: num_rows * dim overflows usize"))?; + if elements.len() != expected_elements { + vortex_bail!( + "list_to_vector_ext: elements buffer has length {} but expected {}", + elements.len(), + expected_elements + ); + } + + let dim_u32 = u32::try_from(dim) + .map_err(|_| vortex_err!("list_to_vector_ext: dimension {dim} does not fit in u32"))?; + + let fsl = FixedSizeListArray::try_new(elements, dim_u32, Validity::NonNullable, num_rows)?; + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) +} + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::Extension; + use vortex::array::arrays::List; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::validity::Validity; + use vortex::buffer::BufferMut; + use vortex::dtype::DType; + use vortex::dtype::Nullability; + use vortex::dtype::PType; + + use super::list_to_vector_ext; + + fn list_f32(rows: &[&[f32]]) -> vortex::array::ArrayRef { + let mut elements = BufferMut::::with_capacity(rows.iter().map(|r| r.len()).sum()); + let mut offsets = BufferMut::::with_capacity(rows.len() + 1); + offsets.push(0); + for row in rows { + for &v in row.iter() { + elements.push(v); + } + offsets.push(i32::try_from(elements.len()).unwrap()); + } + + let elements_array = + PrimitiveArray::new::(elements.freeze(), Validity::NonNullable).into_array(); + let offsets_array = + PrimitiveArray::new::(offsets.freeze(), Validity::NonNullable).into_array(); + vortex::array::Array::::new(elements_array, offsets_array, Validity::NonNullable) + .into_array() + } + + #[test] + fn uniform_list_becomes_vector_extension() 
{ + let list = list_f32(&[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0], &[7.0, 8.0, 9.0]]); + let wrapped = list_to_vector_ext(list).unwrap(); + assert_eq!(wrapped.len(), 3); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + + #[test] + fn mismatched_row_length_is_rejected() { + let list = list_f32(&[&[1.0, 2.0, 3.0], &[4.0, 5.0]]); + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("row 1 has length 2 but expected 3"), + "unexpected error: {err}", + ); + } + + #[test] + fn non_list_input_is_rejected() { + let primitive = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let err = list_to_vector_ext(primitive).unwrap_err().to_string(); + assert!( + err.contains("expects a List array"), + "unexpected error: {err}" + ); + } + + #[test] + fn empty_input_is_rejected() { + let list = list_f32(&[]); + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("cannot infer vector dimension from empty input"), + "unexpected error: {err}", + ); + } + + #[test] + fn non_float_element_type_is_rejected() { + // Build a List. + let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1i32, 2, 3, 4]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 2, 4]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) + .into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("element type must be float"), + "unexpected error: {err}", + ); + + // The unused Nullability / PType imports exist to make the intent clear; suppress + // the dead-code warning by referencing them in a no-op expression. 
+ let _ = (Nullability::NonNullable, PType::I32); + } +} diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index af0d3fdef30..b766e2e6a4c 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -138,6 +138,13 @@ pub enum Format { #[clap(name = "vortex-compact")] #[serde(rename = "vortex-compact")] VortexCompact, + /// Vortex file with the TurboQuant lossy vector-quantization encoding applied to + /// [`vortex_tensor::vector::Vector`] columns. Used by the vector-search benchmark to + /// distinguish TurboQuant-encoded results from the generic BtrBlocks-compressed + /// [`Format::OnDiskVortex`] layout in downstream dashboards. + #[clap(name = "vortex-turboquant")] + #[serde(rename = "vortex-turboquant")] + VortexTurboQuant, #[clap(name = "duckdb")] #[serde(rename = "duckdb")] OnDiskDuckDB, @@ -177,6 +184,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex-file-compressed", Format::VortexCompact => "vortex-compact", + Format::VortexTurboQuant => "vortex-turboquant", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } @@ -189,6 +197,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex", Format::VortexCompact => "vortex", + Format::VortexTurboQuant => "vortex", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index f49349cd95e..8629d38d598 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -348,10 +348,13 @@ impl ToJson for CompressionTimingMeasurement { fn to_json(&self) -> serde_json::Value { let (name, engine) = match self.format { Format::OnDiskVortex => (self.name.to_string(), Engine::Vortex), + Format::VortexTurboQuant => { + (format!("vortex-turboquant {}", self.name), Engine::Vortex) + } Format::Parquet => (format!("parquet_rs-zstd {}", self.name), Engine::Arrow), Format::Lance => (format!("lance {}", self.name), Engine::Arrow), _ => 
vortex_panic!( - "CompressionTimingMeasurement only supports vortex, lance, and parquet formats" + "CompressionTimingMeasurement only supports vortex, vortex-turboquant, lance, and parquet formats" ), }; @@ -392,7 +395,9 @@ pub struct CustomUnitMeasurement { impl ToJson for CustomUnitMeasurement { fn to_json(&self) -> serde_json::Value { let engine = match self.format { - Format::OnDiskVortex | Format::VortexCompact => Engine::Vortex, + Format::OnDiskVortex | Format::VortexCompact | Format::VortexTurboQuant => { + Engine::Vortex + } Format::Parquet => Engine::Arrow, Format::Lance => Engine::Arrow, _ => Engine::Vortex, // Default to Vortex for other formats. From 4a9f5e1b20da24c01dba48d9a1c59e8a4c582896 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:42:17 +0000 Subject: [PATCH 03/18] vortex-bench: add VectorDataset for vector-search benchmark datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce `VectorDataset`, a new `Dataset`-trait implementor that wires public VectorDBBench-hosted embedding corpora into the vortex-bench data-management machinery. The first variant is `CohereSmall` (Cohere wiki-22-12, 100K rows × 768 dims, cosine metric, ~150 MB zstd-parquet), targeting a CI-friendly size. Follow-up variants (CohereMedium, OpenAI*, SIFT*, etc.) can be added alongside without structural change. Each variant carries its upstream `assets.zilliz.com` parquet URL, dimension, row count, and curated distance metric. The download path reuses the existing `download_data` / `idempotent_async` infrastructure from `DownloadableDataset`, caching under `{name}/{name}.parquet` in the local benchmark data directory. 
Signed-off-by: Claude
Signed-off-by: Connor Tsui
---
 vortex-bench/src/lib.rs            |   1 +
 vortex-bench/src/vector_dataset.rs | 159 +++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 vortex-bench/src/vector_dataset.rs

diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs
index b766e2e6a4c..e2829246872 100644
--- a/vortex-bench/src/lib.rs
+++ b/vortex-bench/src/lib.rs
@@ -53,6 +53,7 @@ pub mod statpopgen;
 pub mod tpcds;
 pub mod tpch;
 pub mod utils;
+pub mod vector_dataset;
 
 pub use benchmark::Benchmark;
 pub use benchmark::TableSpec;
diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs
new file mode 100644
index 00000000000..d6a97d20ba9
--- /dev/null
+++ b/vortex-bench/src/vector_dataset.rs
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Datasets used by the vector-search benchmark.
+//!
+//! These are a subset of the public VectorDBBench
+//! (<https://github.com/zilliztech/VectorDBBench>) datasets — MIT-licensed canonical
+//! embedding corpora published by Zilliz under
+//! `https://assets.zilliz.com/benchmark/<dataset>/`. Each dataset is distributed as one or more
+//! parquet files with a `emb: list<float>` column (the raw embedding vectors) and an
+//! `id: int64` column.
+//!
+//! The URL constants below point at the upstream Zilliz bucket. For CI runs we recommend
+//! mirroring these files into an internal bucket first to avoid repeated egress charges on
+//! a third-party bucket — mirror setup is a one-off manual operation and documented in the
+//! vector-search-bench crate README.
+ +use std::path::PathBuf; + +use anyhow::Result; +use async_trait::async_trait; +use tokio::fs::File; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::stream::ArrayStreamExt; +use vortex::file::OpenOptionsSessionExt; +use vortex::file::WriteOptionsSessionExt; + +use crate::IdempotentPath; +use crate::SESSION; +use crate::conversions::parquet_to_vortex_chunks; +use crate::datasets::Dataset; +use crate::datasets::data_downloads::download_data; +use crate::idempotent_async; + +/// A public embedding-vector dataset used by the vector-search benchmark. +/// +/// Each variant is one of the canonical VectorDBBench corpora, distributed as parquet under +/// the Zilliz public benchmark bucket. The smaller `*Small` sizes are appropriate for CI +/// runs; the larger sizes are intended for local / on-demand experiments. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VectorDataset { + /// Cohere wiki-22-12, 100K rows × 768 dims, cosine metric. ~307 MB raw / ~150 MB + /// zstd-parquet — the default CI-friendly size. + CohereSmall, +} + +impl VectorDataset { + /// The upstream URL for this dataset's canonical train-split parquet file. + /// + /// **CI note**: point at an internal mirror before enabling this benchmark in CI. + pub fn parquet_url(&self) -> &'static str { + match self { + VectorDataset::CohereSmall => { + "https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet" + } + } + } + + /// Fixed vector dimensionality for this dataset. + pub fn dim(&self) -> u32 { + match self { + VectorDataset::CohereSmall => 768, + } + } + + /// Expected number of rows in the train split. + pub fn num_rows(&self) -> usize { + match self { + VectorDataset::CohereSmall => 100_000, + } + } + + /// The distance metric the upstream dataset was curated for. v1 only wires cosine, so + /// this is informational today. 
+ pub fn metric(&self) -> VectorMetric { + match self { + VectorDataset::CohereSmall => VectorMetric::Cosine, + } + } +} + +/// Distance metric a dataset was curated for. The vector-search benchmark only wires cosine +/// today, but having this explicit makes it obvious when a future dataset should be paired +/// with L2 or inner-product instead. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VectorMetric { + /// Cosine similarity: `dot(a, b) / (||a|| * ||b||)`. + Cosine, + /// Squared L2 distance: `sum((a - b)^2)`. + L2, + /// Inner product: `dot(a, b)`. + InnerProduct, +} + +#[async_trait] +impl Dataset for VectorDataset { + fn name(&self) -> &str { + match self { + VectorDataset::CohereSmall => "cohere-small", + } + } + + async fn to_parquet_path(&self) -> Result { + let dir = format!("{}/", self.name()).to_data_path(); + let parquet = dir.join(format!("{}.parquet", self.name())); + download_data(parquet.clone(), self.parquet_url()).await?; + Ok(parquet) + } + + async fn to_vortex_array(&self) -> Result { + let parquet = self.to_parquet_path().await?; + let dir = format!("{}/", self.name()).to_data_path(); + let vortex = dir.join(format!("{}.vortex", self.name())); + + let data = parquet_to_vortex_chunks(parquet).await?; + idempotent_async(&vortex, async |path| -> Result<()> { + SESSION + .write_options() + .write( + &mut File::create(path) + .await + .map_err(|e| anyhow::anyhow!("Failed to create file: {}", e))?, + data.into_array().to_array_stream(), + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to write vortex file: {}", e))?; + Ok(()) + }) + .await?; + + Ok(SESSION + .open_options() + .open_path(vortex.as_path()) + .await? + .scan()? + .into_array_stream()? + .read_all() + .await?) 
+ } +} + +#[cfg(test)] +mod tests { + use super::VectorDataset; + use super::VectorMetric; + use crate::datasets::Dataset; + + #[test] + fn cohere_small_metadata() { + let ds = VectorDataset::CohereSmall; + assert_eq!(ds.name(), "cohere-small"); + assert_eq!(ds.dim(), 768); + assert_eq!(ds.num_rows(), 100_000); + assert_eq!(ds.metric(), VectorMetric::Cosine); + assert!(ds.parquet_url().ends_with("/train.parquet")); + assert!(ds.parquet_url().contains("cohere_small_100k")); + } +} From c6e5dc007fcf0aa76e3ddddc0be03d13979be11f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:53:29 +0000 Subject: [PATCH 04/18] Add vector-search-bench crate with Vortex similarity-search variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new standalone benchmark crate at `benchmarks/vector-search-bench/` that measures brute-force cosine similarity search over VectorDBBench-style public embedding corpora. This commit wires up three Vortex variants: - `vortex-uncompressed`: raw `Vector` with no encoding-level compression applied. Baseline for Vortex variants. - `vortex-default`: `BtrBlocksCompressor::default()` applied to the FSL storage — generic lossless compression for float vectors. - `vortex-turboquant`: the full `L2Denorm(SorfTransform(FSL(Dict)))` pipeline via `compress_turboquant`. Lossy; in-memory `.nbytes()` is used as the reported size until `L2Denorm` serialization lands. Four measurements are emitted per `(dataset, variant)` pair: 1. Compressed size (bytes) — on-disk `.vortex` file size for the lossless variants, in-memory footprint for TurboQuant. 2. Full-scan decode time — executes the column to a materialized `FixedSizeListArray`, forcing the entire decompression/dequantization path for each variant. 3. Cosine-similarity execute time — runs `CosineSimilarity(data, query)` into a materialized f32 array. 4. Filter execute time — runs `Binary(Gt, [cosine, threshold])` into a `BoolArray`. 
This is the end-to-end "query pushdown" path. Output goes through the existing `vortex_bench::measurements` types so results flow through the standard `gh-json` pipeline and appear on the CI dashboard alongside compress-bench / random-access-bench results. `Variant::VortexUncompressed` and `Variant::VortexDefault` both report as `Format::OnDiskVortex`; `Variant::VortexTurboQuant` reports as `Format::VortexTurboQuant`, letting downstream consumers distinguish them via the existing `Target { engine, format }` grouping key. The crate ships with a unit test that builds a 64×128 synthetic `Vector` array, runs all three variants through `prepare_variant` + `run_timings`, and verifies every variant reports a non-zero size and non-zero timings. Parquet-Arrow baseline and Recall@10 quality measurements are deferred to a follow-up commit; this commit intentionally keeps the Vortex-only core focused. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 17 + Cargo.toml | 1 + benchmarks/vector-search-bench/Cargo.toml | 31 ++ benchmarks/vector-search-bench/src/lib.rs | 494 +++++++++++++++++++++ benchmarks/vector-search-bench/src/main.rs | 173 ++++++++ 5 files changed, 716 insertions(+) create mode 100644 benchmarks/vector-search-bench/Cargo.toml create mode 100644 benchmarks/vector-search-bench/src/lib.rs create mode 100644 benchmarks/vector-search-bench/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 424eebac8f6..f192874ce88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10052,6 +10052,23 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vector-search-bench" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "clap", + "indicatif", + "itertools 0.14.0", + "tokio", + "tracing", + "vortex", + "vortex-bench", + "vortex-btrblocks", + "vortex-tensor", +] + [[package]] name = "version_check" version = "0.9.5" diff 
--git a/Cargo.toml b/Cargo.toml index 9853cf94ed9..0c904489f58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,7 @@ members = [ "benchmarks/datafusion-bench", "benchmarks/duckdb-bench", "benchmarks/random-access-bench", + "benchmarks/vector-search-bench", ] exclude = ["java/testfiles", "wasm-test"] resolver = "2" diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml new file mode 100644 index 00000000000..f881a5542b1 --- /dev/null +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "vector-search-bench" +description = "Vector similarity search benchmarks for Vortex on public embedding datasets" +authors.workspace = true +categories.workspace = true +edition.workspace = true +homepage.workspace = true +include.workspace = true +keywords.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +async-trait = { workspace = true } +clap = { workspace = true, features = ["derive"] } +indicatif = { workspace = true } +itertools = { workspace = true } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +vortex = { workspace = true } +vortex-bench = { workspace = true } +vortex-btrblocks = { workspace = true } +vortex-tensor = { workspace = true } + +[lints] +workspace = true diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs new file mode 100644 index 00000000000..18356c8ee86 --- /dev/null +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vector similarity-search benchmark core. +//! +//! This crate measures four quantities for each `(dataset, variant)` pair: +//! +//! 1. 
**Compressed storage size** (bytes on disk, or in-memory `.nbytes()` for variants that +//! don't yet serialize — currently just [`Variant::VortexTurboQuant`]). +//! 2. **Full-scan decode time** — executing the `Vector` column into a +//! materialized [`vortex::array::arrays::FixedSizeListArray`]. +//! 3. **Cosine-similarity execute time** — executing +//! `CosineSimilarity(data, const_query)` into a materialized f32 primitive array. +//! 4. **Filter execute time** — executing +//! `Binary(Gt, [CosineSimilarity, threshold])` into a +//! [`vortex::array::arrays::BoolArray`]. +//! +//! Measurements are emitted via the existing `vortex_bench::measurements` types so that +//! the benchmark results flow through the standard `gh-json` pipeline and appear in the +//! CI dashboard alongside compress-bench / random-access-bench results. + +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use clap::ValueEnum; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::VortexSessionExecute; +use vortex::array::arrays::BoolArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::error::VortexExpect; +use vortex::session::VortexSession; +use vortex_bench::Format; +use vortex_bench::SESSION; +use vortex_bench::conversions::list_to_vector_ext; +use vortex_bench::conversions::parquet_to_vortex_chunks; +use vortex_bench::datasets::Dataset; +use vortex_bench::vector_dataset::VectorDataset; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_tensor::vector_search::build_similarity_search_tree; +use vortex_tensor::vector_search::compress_turboquant; + +/// The threshold used when wrapping the similarity expression in a +/// `Binary(Gt, [cosine, threshold])` filter. Set to a value high enough that random pairs +/// from a ~1.0-norm distribution reject but self-query pairs match. 
+pub const DEFAULT_THRESHOLD: f32 = 0.8; + +/// Row index used to pick a query vector from the dataset. Using a fixed row keeps queries +/// reproducible across runs and guarantees at least one match (since `cosine(x, x) == 1.0`). +pub const DEFAULT_QUERY_ROW: usize = 0; + +/// A single data-preparation strategy that the benchmark exercises. +/// +/// Each variant corresponds to one column on the "format" axis in downstream dashboards. The +/// `Format` mapping is what gets serialized into the `target.format` field of gh-json +/// output. +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +pub enum Variant { + /// Raw `Vector` with no encoding-level compression applied. + #[clap(name = "vortex-uncompressed")] + VortexUncompressed, + /// `BtrBlocksCompressor::default()` walks into the `Vector` extension and recursively + /// compresses the FSL storage child. This is the "generic lossless" Vortex story for + /// float vectors. + #[clap(name = "vortex-default")] + VortexDefault, + /// The full TurboQuant pipeline: `L2Denorm(SorfTransform(FSL(Dict)))`. Lossy; dramatic + /// size win; requires reporting recall alongside throughput for the comparison to be + /// honest. See [`vortex_tensor::vector_search::compress_turboquant`]. + #[clap(name = "vortex-turboquant")] + VortexTurboQuant, +} + +impl Variant { + /// The Format enum value this variant reports itself as in emitted measurements. + /// Uncompressed and BtrBlocks-default both surface as [`Format::OnDiskVortex`]; the + /// TurboQuant variant gets its own [`Format::VortexTurboQuant`] so dashboards can + /// distinguish them. + pub fn as_format(&self) -> Format { + match self { + Variant::VortexUncompressed => Format::OnDiskVortex, + Variant::VortexDefault => Format::OnDiskVortex, + Variant::VortexTurboQuant => Format::VortexTurboQuant, + } + } + + /// A stable, kebab-cased label used in metric names so dashboards can split apart + /// variants that map to the same Format. 
+    pub fn label(&self) -> &'static str {
+        match self {
+            Variant::VortexUncompressed => "vortex-uncompressed",
+            Variant::VortexDefault => "vortex-default",
+            Variant::VortexTurboQuant => "vortex-turboquant",
+        }
+    }
+}
+
+/// Number of rows in the query vector — matches the database so `ScalarFnArray`'s
+/// equal-length contract is satisfied. This type alias exists to make the broadcast
+/// semantics obvious at call sites.
+type QueryLen = usize;
+
+/// A materialized Vortex array and its associated execution session / context.
+pub struct PreparedDataset {
+    /// Name used in metric strings — usually the dataset's `Dataset::name()`.
+    pub name: String,
+    /// Uncompressed `Vector` array (canonical form). This is reused as the
+    /// ground-truth basis for TurboQuant recall checks in future commits.
+    pub uncompressed: ArrayRef,
+    /// The query vector to use (a single row pulled from the dataset).
+    pub query: Vec<f32>,
+    /// Parquet file size on disk in bytes — produced by the dataset download step.
+    pub parquet_bytes: u64,
+}
+
+impl PreparedDataset {
+    /// Dimension of the underlying vector column.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.uncompressed` is not an `Extension<FixedSizeList<f32>>` —
+    /// which should be impossible because [`prepare_dataset`] is the only constructor
+    /// and it guarantees this shape.
+    pub fn dim(&self) -> u32 {
+        let fsl_dtype = match self.uncompressed.dtype() {
+            vortex::dtype::DType::Extension(ext) => ext.storage_dtype(),
+            other => {
+                vortex::error::vortex_panic!("expected Extension, got {other}")
+            }
+        };
+        match fsl_dtype {
+            vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim,
+            other => {
+                vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}")
+            }
+        }
+    }
+
+    /// Number of rows in the uncompressed dataset.
+    pub fn num_rows(&self) -> usize {
+        self.uncompressed.len()
+    }
+}
+
+/// Prepare a dataset by downloading its parquet file, converting the `emb` column to a
+/// `Vector` extension array, and extracting a single-row query vector.
+pub async fn prepare_dataset(dataset: &VectorDataset) -> Result<PreparedDataset> {
+    let parquet_path = dataset
+        .to_parquet_path()
+        .await
+        .context("download vector dataset parquet")?;
+    let parquet_bytes = std::fs::metadata(&parquet_path)
+        .with_context(|| format!("stat parquet file {:?}", parquet_path))?
+        .len();
+
+    tracing::info!(
+        "ingesting {} parquet from {:?} ({} bytes)",
+        dataset.name(),
+        parquet_path,
+        parquet_bytes
+    );
+
+    let chunked = parquet_to_vortex_chunks(parquet_path).await?;
+
+    let struct_array = chunked.into_array();
+    let emb_column = extract_emb_column(&struct_array)?;
+    let uncompressed = list_to_vector_ext(emb_column)?;
+
+    let query = extract_query_row(&uncompressed, DEFAULT_QUERY_ROW)?;
+
+    Ok(PreparedDataset {
+        name: dataset.name().to_string(),
+        uncompressed,
+        query,
+        parquet_bytes,
+    })
+}
+
+/// Project the `emb` column out of a chunked struct array. This rebuilds a chunked list
+/// array with just that one column.
+fn extract_emb_column(struct_array: &ArrayRef) -> Result<ArrayRef> {
+    use vortex::array::arrays::Chunked;
+    use vortex::array::arrays::ChunkedArray;
+    use vortex::array::arrays::Struct;
+    use vortex::array::arrays::chunked::ChunkedArrayExt;
+    use vortex::array::arrays::struct_::StructArrayExt as _;
+
+    if let Some(chunked) = struct_array.as_opt::<Chunked>() {
+        let mut emb_chunks: Vec<ArrayRef> = Vec::with_capacity(chunked.nchunks());
+        for chunk in chunked.iter_chunks() {
+            emb_chunks.push(extract_emb_column(chunk)?);
+        }
+        if emb_chunks.is_empty() {
+            bail!("dataset has no chunks");
+        }
+        return Ok(ChunkedArray::from_iter(emb_chunks).into_array());
+    }
+
+    let Some(struct_view) = struct_array.as_opt::<Struct>() else {
+        bail!(
+            "expected dataset chunks to be Struct arrays, got {}",
+            struct_array.dtype()
+        );
+    };
+
+    let field = struct_view
+        .unmasked_field_by_name("emb")
+        .context("dataset parquet must have an `emb` column")?;
+    Ok(field.clone())
+}
+
+/// Pull a single row out of a `Vector` extension array as a plain `Vec<f32>`.
+fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result<Vec<f32>> {
+    use vortex::array::arrays::Extension;
+    use vortex::array::arrays::extension::ExtensionArrayExt;
+    use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt;
+
+    let mut ctx = SESSION.create_execution_ctx();
+
+    let ext_view = vector_ext
+        .as_opt::<Extension>()
+        .context("prepared dataset must be a Vector extension array")?;
+
+    // Execute storage array to its canonical FSL form.
+ let fsl: FixedSizeListArray = ext_view.storage_array().clone().execute(&mut ctx)?; + + let dim_usize = { + let vortex::dtype::DType::FixedSizeList(_, d, _) = fsl.dtype() else { + bail!("storage dtype must be FixedSizeList"); + }; + *d as usize + }; + + if row * dim_usize + dim_usize > vector_ext.len() * dim_usize { + bail!( + "query row {row} out of bounds for dataset of length {}", + vector_ext.len() + ); + } + + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; + let slice = elements.as_slice::(); + let start = row * dim_usize; + Ok(slice[start..start + dim_usize].to_vec()) +} + +/// Apply a `Variant`'s preparation strategy to the uncompressed Vortex array and return the +/// prepared array together with its reported size in bytes. For serializable variants the +/// size is the number of bytes written to a `.vortex` file; for in-memory-only variants +/// (TurboQuant) it's the live `.nbytes()` footprint. +pub async fn prepare_variant( + prepared: &PreparedDataset, + variant: Variant, + session: &VortexSession, +) -> Result<(ArrayRef, u64)> { + match variant { + Variant::VortexUncompressed => { + let array = prepared.uncompressed.clone(); + let size = + measure_on_disk_size(&array, session, &prepared.name, "uncompressed").await?; + Ok((array, size)) + } + Variant::VortexDefault => { + let array = BtrBlocksCompressor::default().compress(&prepared.uncompressed)?; + let size = measure_on_disk_size(&array, session, &prepared.name, "default").await?; + Ok((array, size)) + } + Variant::VortexTurboQuant => { + let mut ctx = session.create_execution_ctx(); + let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; + // TurboQuant cannot yet round-trip through a Vortex file (L2Denorm metadata + // serialization is not implemented). Report the in-memory `.nbytes()` footprint + // as a proxy. Document this in the benchmark output so consumers of the + // dashboard aren't misled. 
+ let size = array.nbytes() as u64; + Ok((array, size)) + } + } +} + +/// Serialize a prepared Vortex array to a temporary `.vortex` file and return its length. +/// This is what we report as the "compressed size" for serializable variants; it matches +/// the semantics of `compress-bench` which reports the on-disk parquet/vortex file size. +async fn measure_on_disk_size( + array: &ArrayRef, + session: &VortexSession, + dataset_name: &str, + variant_label: &str, +) -> Result { + use vortex::file::WriteOptionsSessionExt; + + let tmp_dir = std::env::temp_dir().join("vortex-vector-search-bench"); + tokio::fs::create_dir_all(&tmp_dir).await?; + let tmp_path = tmp_dir.join(format!("{dataset_name}-{variant_label}.vortex")); + + let mut file = tokio::fs::File::create(&tmp_path).await?; + session + .write_options() + .write(&mut file, array.clone().to_array_stream()) + .await?; + + let metadata = tokio::fs::metadata(&tmp_path).await?; + Ok(metadata.len()) +} + +/// Run the decode / cosine / filter microbenchmarks against a prepared variant array and +/// return the best-of-`iterations` wall times for each measurement. 
+pub fn run_timings( + variant_array: &ArrayRef, + query: &[f32], + iterations: usize, + session: &VortexSession, +) -> Result { + let _ = QueryLen::default; // touch the type alias so rustc doesn't warn + + let mut decode = Duration::MAX; + let mut cosine = Duration::MAX; + let mut filter = Duration::MAX; + + for _ in 0..iterations { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let decoded: FixedSizeListArray = decode_full_scan(variant_array, &mut ctx)?; + decode = decode.min(start.elapsed()); + drop(decoded); + } + + for _ in 0..iterations { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; + cosine = cosine.min(start.elapsed()); + drop(scores); + } + + for _ in 0..iterations { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let matches: BoolArray = execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; + filter = filter.min(start.elapsed()); + drop(matches); + } + + Ok(VariantTimings { + decode, + cosine, + filter, + }) +} + +/// Timing summary for one `(dataset, variant)` pair. +#[derive(Debug, Clone, Copy)] +pub struct VariantTimings { + /// Wall time for a full column decode. + pub decode: Duration, + /// Wall time for the cosine_similarity scalar-function execution. + pub cosine: Duration, + /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression. + pub filter: Duration, +} + +/// Fully materialize the input column so the measurement captures *all* decompression +/// work — the extension shell, the FSL storage, and the inner element buffer. +/// +/// For the Vortex-uncompressed variant this is cheap (bitwise copy / no-op). For +/// BtrBlocks-default it includes FSL decompression. For TurboQuant it includes running +/// the inverse SORF rotation + dictionary lookup through the scalar-fn pipeline. 
+fn decode_full_scan( + array: &ArrayRef, + ctx: &mut vortex::array::ExecutionCtx, +) -> Result { + use vortex::array::arrays::ExtensionArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + + let ext: ExtensionArray = array.clone().execute(ctx)?; + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; + Ok(fsl) +} + +fn execute_cosine( + data: &ArrayRef, + query: &[f32], + ctx: &mut vortex::array::ExecutionCtx, +) -> Result { + use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; + use vortex_tensor::vector_search::build_constant_query_vector; + + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) + .vortex_expect("cosine similarity accepts matching Vector inputs") + .into_array(); + Ok(cosine.execute(ctx)?) +} + +fn execute_filter( + data: &ArrayRef, + query: &[f32], + threshold: f32, + ctx: &mut vortex::array::ExecutionCtx, +) -> Result { + let tree = build_similarity_search_tree(data.clone(), query, threshold)?; + Ok(tree.execute(ctx)?) +} + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::ExtensionArray; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + use vortex::array::extension::EmptyMetadata; + use vortex::array::validity::Validity; + use vortex::buffer::BufferMut; + use vortex::dtype::extension::ExtDType; + use vortex_bench::SESSION; + use vortex_tensor::vector::Vector; + + use super::*; + + fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { + let mut buf = BufferMut::::with_capacity(num_rows * dim as usize); + let mut state = seed; + for _ in 0..(num_rows * dim as usize) { + // Simple xorshift — deterministic, distribution not important for this test. 
+ state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + let v = ((state & 0xFFFF) as f32 / 32768.0) - 1.0; + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable).into_array(); + let fsl = + FixedSizeListArray::try_new(elements, dim, Validity::NonNullable, num_rows).unwrap(); + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone()) + .unwrap() + .erased(); + ExtensionArray::new(ext_dtype, fsl.into_array()).into_array() + } + + #[test] + fn prepare_variant_produces_non_empty_array_for_all_variants() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let ext = uncompressed + .as_opt::() + .unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); + let slice = elements.as_slice::(); + let query = slice[..dim as usize].to_vec(); + + let prepared = PreparedDataset { + name: "synthetic".to_string(), + uncompressed: uncompressed.clone(), + query, + parquet_bytes: 0, + }; + + for variant in [ + Variant::VortexUncompressed, + Variant::VortexDefault, + Variant::VortexTurboQuant, + ] { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let (array, size) = rt + .block_on(prepare_variant(&prepared, variant, &SESSION)) + .unwrap(); + assert_eq!( + array.len(), + num_rows, + "variant {variant:?} changed row count" + ); + assert!(size > 0, "variant {variant:?} reported zero size"); + + let timings = run_timings(&array, &prepared.query, 2, &SESSION).unwrap(); + assert!(timings.decode > Duration::ZERO); + assert!(timings.cosine > Duration::ZERO); + assert!(timings.filter > Duration::ZERO); + } + } +} diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs new file mode 100644 index 
00000000000..74f4c740f6b --- /dev/null +++ b/benchmarks/vector-search-bench/src/main.rs @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `vector-search-bench` — brute-force cosine-similarity benchmark over public VectorDBBench +//! embedding corpora. +//! +//! Usage: +//! +//! ```bash +//! cargo run -p vector-search-bench --release -- \ +//! --datasets cohere-small \ +//! --variants vortex-uncompressed,vortex-default,vortex-turboquant \ +//! --iterations 5 \ +//! -d table +//! ``` + +use std::borrow::Cow; +use std::path::PathBuf; + +use anyhow::Result; +use clap::Parser; +use indicatif::ProgressBar; +use vector_search_bench::Variant; +use vector_search_bench::prepare_dataset; +use vector_search_bench::prepare_variant; +use vector_search_bench::run_timings; +use vortex_bench::SESSION; +use vortex_bench::create_output_writer; +use vortex_bench::display::DisplayFormat; +use vortex_bench::display::print_measurements_json; +use vortex_bench::measurements::CompressionTimingMeasurement; +use vortex_bench::measurements::CustomUnitMeasurement; +use vortex_bench::setup_logging_and_tracing; +use vortex_bench::vector_dataset::VectorDataset; + +const BENCHMARK_ID: &str = "vector-search"; + +/// Command-line arguments for `vector-search-bench`. +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Number of timed iterations per measurement. The reported time is the minimum across + /// iterations (matches compress-bench convention). + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + /// Subset of datasets to run. Defaults to Cohere-small. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableDataset::CohereSmall])] + datasets: Vec, + + /// Subset of variants to exercise. Defaults to all three Vortex variants. 
+ #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Variant::VortexUncompressed, Variant::VortexDefault, Variant::VortexTurboQuant])] + variants: Vec, + + /// Output display format (`table` for humans, `gh-json` for CI ingestion). + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + /// If set, write output to this file instead of stdout. + #[arg(short, long)] + output_path: Option, + + /// Verbose logging. + #[arg(short, long)] + verbose: bool, + + /// Enable perfetto tracing output. + #[arg(long)] + tracing: bool, +} + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] +enum SelectableDataset { + #[clap(name = "cohere-small")] + CohereSmall, +} + +impl SelectableDataset { + fn into_dataset(self) -> VectorDataset { + match self { + SelectableDataset::CohereSmall => VectorDataset::CohereSmall, + } + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let datasets: Vec = args + .datasets + .iter() + .copied() + .map(SelectableDataset::into_dataset) + .collect(); + + let total_work = datasets.len() * args.variants.len(); + let progress = ProgressBar::new(total_work as u64); + + let mut timings: Vec = Vec::new(); + let mut sizes: Vec = Vec::new(); + + for dataset in &datasets { + let prepared = prepare_dataset(dataset).await?; + tracing::info!( + "prepared {}: dim={}, num_rows={}", + prepared.name, + prepared.dim(), + prepared.num_rows() + ); + + for &variant in &args.variants { + let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; + + let variant_label = variant.label(); + let bench_name = format!("{variant_label}/{}", prepared.name); + + sizes.push(CustomUnitMeasurement { + name: format!("{variant_label} size/{}", prepared.name), + format: variant.as_format(), + unit: Cow::from("bytes"), + value: size_bytes as f64, + }); + + let variant_timings = + 
run_timings(&variant_array, &prepared.query, args.iterations, &SESSION)?; + + timings.push(CompressionTimingMeasurement { + name: format!("decode time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.decode, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.filter, + }); + + progress.inc(1); + } + } + progress.finish(); + + let mut writer = create_output_writer(&args.display_format, args.output_path, BENCHMARK_ID)?; + match args.display_format { + DisplayFormat::Table => { + // Our variants span multiple `Format` values *and* multiple labels that share a + // single `Format`, so the existing `render_table` helper (which groups by + // `Target`) would collapse them. Emit one line per measurement instead; this is + // only used for developer inspection — CI consumes `gh-json` via the arm below. + for timing in &timings { + writeln!(writer, "{} {} ns", timing.name, timing.time.as_nanos())?; + } + for size in &sizes { + writeln!(writer, "{} {} {}", size.name, size.value, size.unit)?; + } + } + DisplayFormat::GhJson => { + print_measurements_json(&mut writer, timings)?; + print_measurements_json(&mut writer, sizes)?; + } + } + + Ok(()) +} + +use std::io::Write; From bc3861f28ee849e640ef5ccd2b51fcaff6cd69d3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:00:42 +0000 Subject: [PATCH 05/18] vector-search-bench: add parquet-Arrow baseline and Recall@10 measurement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This completes the v1 scope of the vector-search benchmark by adding two orthogonal pieces of functionality: 1. 
**Parquet-Arrow hand-rolled cosine baseline** (`parquet_baseline.rs`) reads the canonical parquet file via `parquet::arrow`, decodes the `emb` column to a flat `Vec<f32>`, and runs a straightforward Rust cosine loop. This is the "what you'd do without Vortex" external floor — necessary for the benchmark to tell a credible story about Vortex's value vs. raw parquet. Reports size, decode time, cosine time, and filter time in the same format as the Vortex variants, under `Format::Parquet`. Handles both `List` and `FixedSizeList` parquet schemas. 2. **Recall@K quality measurement** (`recall.rs`) computes the fraction of the exact top-K nearest neighbours that the lossy TurboQuant variant recovers, using the uncompressed Vortex scan as local ground truth (not VectorDBBench's shipped `neighbors.parquet`, which would require an index and is out of scope). Samples are deterministic — rows picked at uniform intervals across the dataset — so results are stable across runs. Only TurboQuant is checked; lossless variants are trivially 1.0. Emitted as a `CustomUnitMeasurement` with unit "recall" so it flows through the existing gh-json pipeline alongside the size/timing measurements. The binary picks this all up behind three new CLI flags: `--parquet-baseline` (default true), `--recall-queries N` (default 100), and `--recall-k K` (default 10). Three unit tests cover the new functionality: - `parquet_baseline_reads_fsl_column` — end-to-end parquet write + read + cosine loop on a 3×3 fixture. - `uncompressed_has_perfect_self_recall` — sanity check that recall@10 of a variant against itself is exactly 1.0. - `turboquant_recall_is_reasonable_for_synthetic_data` — loose lower bound (>= 0.3) on TurboQuant recall for a 64×128 random dataset to catch total regressions without being flaky. A small `test_utils` submodule was extracted from the lib's existing tests module so the recall tests can reuse the synthetic vector generator.
Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 4 + benchmarks/vector-search-bench/Cargo.toml | 6 + benchmarks/vector-search-bench/src/lib.rs | 30 +- benchmarks/vector-search-bench/src/main.rs | 88 +++++ .../src/parquet_baseline.rs | 303 ++++++++++++++++++ benchmarks/vector-search-bench/src/recall.rs | 188 +++++++++++ 6 files changed, 611 insertions(+), 8 deletions(-) create mode 100644 benchmarks/vector-search-bench/src/parquet_baseline.rs create mode 100644 benchmarks/vector-search-bench/src/recall.rs diff --git a/Cargo.lock b/Cargo.lock index f192874ce88..c56f8e5330e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10057,10 +10057,14 @@ name = "vector-search-bench" version = "0.1.0" dependencies = [ "anyhow", + "arrow-array 58.0.0", + "arrow-schema 58.0.0", "async-trait", "clap", "indicatif", "itertools 0.14.0", + "parquet 58.0.0", + "tempfile", "tokio", "tracing", "vortex", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index f881a5542b1..068cf400d71 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -16,10 +16,13 @@ publish = false [dependencies] anyhow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } async-trait = { workspace = true } clap = { workspace = true, features = ["derive"] } indicatif = { workspace = true } itertools = { workspace = true } +parquet = { workspace = true } tokio = { workspace = true, features = ["full"] } tracing = { workspace = true } vortex = { workspace = true } @@ -27,5 +30,8 @@ vortex-bench = { workspace = true } vortex-btrblocks = { workspace = true } vortex-tensor = { workspace = true } +[dev-dependencies] +tempfile = { workspace = true } + [lints] workspace = true diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 18356c8ee86..27f61f39ed7 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ 
b/benchmarks/vector-search-bench/src/lib.rs @@ -22,6 +22,9 @@ use std::time::Duration; use std::time::Instant; +pub mod parquet_baseline; +pub mod recall; + use anyhow::Context; use anyhow::Result; use anyhow::bail; @@ -407,28 +410,27 @@ fn execute_filter( Ok(tree.execute(ctx)?) } +/// Test-only helpers shared between the unit tests in this crate's submodules. #[cfg(test)] -mod tests { +pub(crate) mod test_utils { + use vortex::array::ArrayRef; use vortex::array::IntoArray; use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::PrimitiveArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex::array::extension::EmptyMetadata; use vortex::array::validity::Validity; use vortex::buffer::BufferMut; use vortex::dtype::extension::ExtDType; - use vortex_bench::SESSION; use vortex_tensor::vector::Vector; - use super::*; - - fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { + /// Build a deterministic `Vector` extension array of `num_rows` rows for + /// tests. The PRNG is a trivial xorshift keyed by `seed`; we don't care about the + /// distribution beyond "not all zeros". + pub fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { let mut buf = BufferMut::::with_capacity(num_rows * dim as usize); let mut state = seed; for _ in 0..(num_rows * dim as usize) { - // Simple xorshift — deterministic, distribution not important for this test. 
state ^= state << 13; state ^= state >> 7; state ^= state << 17; @@ -443,6 +445,18 @@ mod tests { .erased(); ExtensionArray::new(ext_dtype, fsl.into_array()).into_array() } +} + +#[cfg(test)] +mod tests { + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + use vortex_bench::SESSION; + + use super::test_utils::synthetic_vector; + use super::*; #[test] fn prepare_variant_produces_non_empty_array_for_all_variants() { diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 74f4c740f6b..5a5ef57a742 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -20,12 +20,18 @@ use std::path::PathBuf; use anyhow::Result; use clap::Parser; use indicatif::ProgressBar; +use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; +use vector_search_bench::parquet_baseline::run_parquet_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; +use vector_search_bench::recall::DEFAULT_TOP_K; +use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; +use vortex_bench::Format; use vortex_bench::SESSION; use vortex_bench::create_output_writer; +use vortex_bench::datasets::Dataset; use vortex_bench::display::DisplayFormat; use vortex_bench::display::print_measurements_json; use vortex_bench::measurements::CompressionTimingMeasurement; @@ -52,6 +58,20 @@ struct Args { #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Variant::VortexUncompressed, Variant::VortexDefault, Variant::VortexTurboQuant])] variants: Vec, + /// Also run the Parquet-Arrow hand-rolled cosine baseline as an additional variant. + /// Default `true` — disable only when you intentionally want a Vortex-only comparison. 
+ #[arg(long, default_value_t = true)] + parquet_baseline: bool, + + /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables + /// the quality measurement entirely (useful for smoke tests). + #[arg(long, default_value_t = 100)] + recall_queries: usize, + + /// K in Recall@K. Defaults to 10, matching VectorDBBench conventions. + #[arg(long, default_value_t = DEFAULT_TOP_K)] + recall_k: usize, + /// Output display format (`table` for humans, `gh-json` for CI ingestion). #[arg(short, long, default_value_t, value_enum)] display_format: DisplayFormat, @@ -101,6 +121,8 @@ async fn main() -> Result<()> { let mut timings: Vec = Vec::new(); let mut sizes: Vec = Vec::new(); + let mut recalls: Vec = Vec::new(); + for dataset in &datasets { let prepared = prepare_dataset(dataset).await?; tracing::info!( @@ -110,6 +132,44 @@ async fn main() -> Result<()> { prepared.num_rows() ); + // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label + // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex + // variants. 
+ if args.parquet_baseline { + let parquet_path = dataset.to_parquet_path().await?; + let baseline_timings = run_parquet_baseline_timings( + &parquet_path, + &prepared.query, + DEFAULT_THRESHOLD, + args.iterations, + )?; + + let label = "parquet"; + let bench_name = format!("{label}/{}", prepared.name); + + sizes.push(CustomUnitMeasurement { + name: format!("{label} size/{}", prepared.name), + format: Format::Parquet, + unit: Cow::from("bytes"), + value: prepared.parquet_bytes as f64, + }); + timings.push(CompressionTimingMeasurement { + name: format!("decode time/{bench_name}"), + format: Format::Parquet, + time: baseline_timings.decode, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: Format::Parquet, + time: baseline_timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: Format::Parquet, + time: baseline_timings.filter, + }); + } + for &variant in &args.variants { let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; @@ -142,6 +202,26 @@ async fn main() -> Result<()> { time: variant_timings.filter, }); + // Recall@K quality measurement for lossy variants only. The lossless + // variants (uncompressed + BtrBlocks default) are trivially 1.0 against + // the uncompressed ground truth, so we skip them to avoid noise. 
+ if args.recall_queries > 0 && variant == Variant::VortexTurboQuant { + let recall = measure_recall_at_k( + &prepared.uncompressed, + &variant_array, + args.recall_queries, + args.recall_k, + &SESSION, + )?; + tracing::info!("Recall@{} for {}: {:.4}", args.recall_k, bench_name, recall); + recalls.push(CustomUnitMeasurement { + name: format!("recall@{}/{bench_name}", args.recall_k), + format: variant.as_format(), + unit: Cow::from("recall"), + value: recall, + }); + } + progress.inc(1); } } @@ -160,10 +240,18 @@ async fn main() -> Result<()> { for size in &sizes { writeln!(writer, "{} {} {}", size.name, size.value, size.unit)?; } + for recall in &recalls { + writeln!( + writer, + "{} {:.4} {}", + recall.name, recall.value, recall.unit + )?; + } } DisplayFormat::GhJson => { print_measurements_json(&mut writer, timings)?; print_measurements_json(&mut writer, sizes)?; + print_measurements_json(&mut writer, recalls)?; } } diff --git a/benchmarks/vector-search-bench/src/parquet_baseline.rs b/benchmarks/vector-search-bench/src/parquet_baseline.rs new file mode 100644 index 00000000000..b80a2308370 --- /dev/null +++ b/benchmarks/vector-search-bench/src/parquet_baseline.rs @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Parquet-Arrow hand-rolled cosine similarity baseline. +//! +//! This module provides the "what you'd do without Vortex" floor for the vector-search +//! benchmark. It reads the canonical parquet file for a dataset via `parquet::arrow`, +//! decodes the `emb` column to an Arrow `FixedSizeListArray`, and then runs a +//! straightforward Rust cosine-similarity loop — no scalar functions, no lazy expressions, +//! no index. +//! +//! The four measurements produced mirror those of the Vortex variants so dashboards can +//! put the parquet bar right next to the vortex bars: +//! +//! 1. Compressed size — the on-disk parquet file in bytes. +//! 2. 
Full-scan decode time — parquet → arrow record batches → concatenated +//! `FixedSizeListArray`. +//! 3. Cosine-similarity execute time — hand-rolled loop producing a `Vec` of scores. +//! 4. Filter execute time — the same loop materializing into a `Vec` where +//! `score > threshold`. +//! +//! This module does *not* include the parquet decode time in the cosine/filter wall +//! times. Decoding is treated as its own measurement. This matches how the Vortex variants +//! separate decode from compute. + +use std::fs::File; +use std::path::Path; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use arrow_array::Array; +use arrow_array::FixedSizeListArray; +use arrow_array::Float32Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::cast::AsArray; +use arrow_schema::DataType; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + +use crate::VariantTimings; + +/// Read the entire `emb` column of a parquet file into a single flat `Vec`, along +/// with the dimension and row count. +pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { + let file = File::open(parquet_path) + .with_context(|| format!("open parquet file {}", parquet_path.display()))?; + let file_size = file.metadata()?.len(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + + // Locate the `emb` column and sanity-check its type. + let (emb_idx, emb_field) = builder + .schema() + .column_with_name("emb") + .context("parquet schema missing `emb` column")?; + + // VectorDBBench parquet files use `list`; some others use `fixed_size_list`. + // Both need to be supported — the canonical parquet emit from arrow-rs is `list` + // since parquet has no fixed-size-list logical type. 
+ let element_dtype = match emb_field.data_type() { + DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) => { + field.data_type().clone() + } + other => bail!("emb column must be a list of float, got {other:?}"), + }; + if !matches!(element_dtype, DataType::Float32) { + bail!( + "emb column element type must be Float32, got {:?}", + element_dtype + ); + } + let _ = emb_idx; + + let reader = builder.build()?; + let batches: Vec = reader.collect::, _>>()?; + + let mut data = Vec::::new(); + let mut num_rows = 0usize; + let mut inferred_dim: Option = None; + + for batch in batches.iter() { + let column = batch + .column_by_name("emb") + .context("emb column missing from record batch")?; + append_batch(column, &mut data, &mut inferred_dim, &mut num_rows)?; + } + + let dim = inferred_dim.context("parquet file has zero rows — cannot infer dimension")?; + Ok(ParquetBaselineData { + elements: data, + dim, + num_rows, + file_size, + }) +} + +fn append_batch( + column: &dyn Array, + data: &mut Vec, + inferred_dim: &mut Option, + num_rows: &mut usize, +) -> Result<()> { + if let Some(fsl) = column.as_any().downcast_ref::() { + let dim = fsl.value_length() as usize; + maybe_set_dim(inferred_dim, dim)?; + let values = fsl + .values() + .as_any() + .downcast_ref::() + .context("FSL emb column must have Float32 values")?; + data.extend_from_slice(values.values()); + *num_rows += fsl.len(); + return Ok(()); + } + + if let Some(list) = column.as_any().downcast_ref::() { + let values: &Float32Array = list + .values() + .as_primitive_opt::() + .context("List emb column must have Float32 values")?; + let offsets = list.value_offsets(); + for i in 0..list.len() { + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + let row_len = end - start; + maybe_set_dim(inferred_dim, row_len)?; + data.extend_from_slice(&values.values()[start..end]); + *num_rows += 1; + } + return Ok(()); + } + + bail!( + "emb column has unsupported arrow type 
{:?}", + column.data_type() + ); +} + +fn maybe_set_dim(inferred_dim: &mut Option, new_dim: usize) -> Result<()> { + match inferred_dim { + Some(d) if *d == new_dim => Ok(()), + Some(d) => bail!("inconsistent emb dimensions: saw {d} then {new_dim}"), + None if new_dim == 0 => bail!("emb row has zero elements"), + None => { + *inferred_dim = Some(new_dim); + Ok(()) + } + } +} + +/// The flattened representation of a parquet file's embedding column, suitable for a +/// hand-rolled distance loop. +pub struct ParquetBaselineData { + /// All rows concatenated: `elements.len() == num_rows * dim`. + pub elements: Vec, + /// Vector dimensionality. + pub dim: usize, + /// Number of rows. + pub num_rows: usize, + /// On-disk size of the parquet file in bytes. + pub file_size: u64, +} + +/// Run the decode / cosine / filter baseline microbenchmarks and return the best-of-N +/// wall times. Decoding is re-parquet-reading from disk on each iteration (matches how +/// the Vortex variants also re-execute from scratch each iteration). +pub fn run_parquet_baseline_timings( + parquet_path: &Path, + query: &[f32], + threshold: f32, + iterations: usize, +) -> Result { + let mut decode = Duration::MAX; + let mut cosine = Duration::MAX; + let mut filter = Duration::MAX; + + for _ in 0..iterations { + let start = Instant::now(); + let data = read_parquet_embedding_column(parquet_path)?; + decode = decode.min(start.elapsed()); + + let start = Instant::now(); + let scores = cosine_loop(&data.elements, data.num_rows, data.dim, query); + cosine = cosine.min(start.elapsed()); + debug_assert_eq!(scores.len(), data.num_rows); + + let start = Instant::now(); + let matches = filter_loop(&scores, threshold); + filter = filter.min(start.elapsed()); + debug_assert_eq!(matches.len(), data.num_rows); + } + + Ok(VariantTimings { + decode, + cosine, + filter, + }) +} + +/// Compute cosine similarity for every row against `query`. The query is assumed to match +/// the database vectors' dimension. 
Returns one f32 score per row; scores for zero-norm +/// rows or a zero-norm query are 0.0 by convention. +pub fn cosine_loop(elements: &[f32], num_rows: usize, dim: usize, query: &[f32]) -> Vec { + assert_eq!(query.len(), dim); + assert_eq!(elements.len(), num_rows * dim); + + let query_norm = query.iter().map(|&q| q * q).sum::().sqrt(); + let mut out = Vec::with_capacity(num_rows); + if query_norm == 0.0 { + out.resize(num_rows, 0.0); + return out; + } + + for row in 0..num_rows { + let base = row * dim; + let slice = &elements[base..base + dim]; + let mut dot = 0.0f32; + let mut sq = 0.0f32; + for i in 0..dim { + dot += slice[i] * query[i]; + sq += slice[i] * slice[i]; + } + let norm = sq.sqrt(); + if norm == 0.0 { + out.push(0.0); + } else { + out.push(dot / (norm * query_norm)); + } + } + out +} + +/// Build the `cosine > threshold` boolean mask. +pub fn filter_loop(scores: &[f32], threshold: f32) -> Vec { + scores.iter().map(|&s| s > threshold).collect() +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::sync::Arc; + + use arrow_array::RecordBatch; + use arrow_array::builder::FixedSizeListBuilder; + use arrow_array::builder::Float32Builder; + use arrow_schema::DataType; + use arrow_schema::Field; + use arrow_schema::Schema; + use parquet::arrow::ArrowWriter; + use tempfile::NamedTempFile; + + use super::*; + + /// Build a minimal parquet file with an `emb: FixedSizeList` column and + /// verify the baseline pipeline produces the expected scores. 
+ fn write_tiny_fsl_parquet(dim: i32, rows: &[&[f32]]) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new( + "emb", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), dim), + false, + )])); + + let file = NamedTempFile::new()?; + let mut writer = + ArrowWriter::try_new(File::create(file.path())?, Arc::clone(&schema), None)?; + + let dim_usize = usize::try_from(dim).unwrap(); + let mut builder = FixedSizeListBuilder::new(Float32Builder::new(), dim); + for row in rows { + assert_eq!(row.len(), dim_usize); + for &v in row.iter() { + builder.values().append_value(v); + } + builder.append(true); + } + let array = builder.finish(); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)])?; + writer.write(&batch)?; + writer.close()?; + Ok(file) + } + + #[test] + fn parquet_baseline_reads_fsl_column() { + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + + let data = read_parquet_embedding_column(file.path()).unwrap(); + assert_eq!(data.dim, 3); + assert_eq!(data.num_rows, 3); + assert_eq!(data.elements.len(), 9); + + let query = [1.0f32, 0.0, 0.0]; + let scores = cosine_loop(&data.elements, data.num_rows, data.dim, &query); + assert_eq!(scores, vec![1.0, 0.0, 1.0]); + + let mask = filter_loop(&scores, 0.5); + assert_eq!(mask, vec![true, false, true]); + } +} diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs new file mode 100644 index 00000000000..15132561b4d --- /dev/null +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Recall@K quality measurement for lossy vector-search variants. +//! +//! This module computes the fraction of the true top-K nearest neighbours that a +//! lossy encoding (today just TurboQuant) recovers, using the uncompressed Vortex +//! 
scan as the local ground truth. Recall is averaged over a small number of sampled +//! query rows. +//! +//! This is explicitly a *relative* recall — we compare TurboQuant-retrieved neighbours +//! against the neighbours that the *same* cosine-similarity expression finds in the +//! uncompressed scan, not against VectorDBBench's shipped `neighbors.parquet`. Comparing +//! against external ground truth would require an index (which Vortex doesn't have) and +//! is structurally out of scope for a file-format benchmark. + +use anyhow::Result; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::VortexSessionExecute; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::extension::ExtensionArrayExt; +use vortex::error::VortexExpect; +use vortex::session::VortexSession; +use vortex::utils::aliases::hash_set::HashSet; +use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; +use vortex_tensor::vector_search::build_constant_query_vector; + +/// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. +pub const DEFAULT_TOP_K: usize = 10; + +/// Compute recall@K for the lossy `compressed` variant against the `uncompressed` +/// ground-truth variant, averaged over `num_queries` sampled query rows. +/// +/// Query selection is deterministic: rows are picked uniformly across the dataset at +/// `step = uncompressed.len() / num_queries` intervals. This keeps the result stable +/// across runs and avoids needing to thread a PRNG seed into the benchmark CLI. 
+pub fn measure_recall_at_k( + uncompressed: &ArrayRef, + compressed: &ArrayRef, + num_queries: usize, + top_k: usize, + session: &VortexSession, +) -> Result { + assert!( + num_queries > 0, + "measure_recall_at_k requires num_queries > 0" + ); + assert!(top_k > 0, "measure_recall_at_k requires top_k > 0"); + let num_rows = uncompressed.len(); + assert_eq!( + compressed.len(), + num_rows, + "uncompressed and compressed arrays must have the same row count" + ); + assert!(num_rows >= top_k, "dataset must have at least top_k rows"); + + let step = (num_rows / num_queries).max(1); + + let mut total_hits: usize = 0; + let mut total_checked: usize = 0; + + for q in 0..num_queries { + let row = (q * step).min(num_rows - 1); + let query = extract_query_row(uncompressed, row, session)?; + + let gt_scores = score_all_rows(uncompressed, &query, session)?; + let truth = top_k_indices(>_scores, top_k); + + let lossy_scores = score_all_rows(compressed, &query, session)?; + let lossy = top_k_indices(&lossy_scores, top_k); + + let truth_set: HashSet = truth.iter().copied().collect(); + total_hits += lossy.iter().filter(|idx| truth_set.contains(*idx)).count(); + total_checked += top_k; + } + + Ok(total_hits as f64 / total_checked as f64) +} + +fn extract_query_row( + vector_ext: &ArrayRef, + row: usize, + session: &VortexSession, +) -> Result> { + use anyhow::Context; + use vortex::array::arrays::Extension; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + + let mut ctx = session.create_execution_ctx(); + let ext = vector_ext + .as_opt::() + .context("extract_query_row expects an Extension array")?; + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx)?; + + let dim_usize = match fsl.dtype() { + vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim as usize, + other => anyhow::bail!("expected FixedSizeList storage, got {other}"), + }; + + let elements: PrimitiveArray = 
fsl.elements().clone().execute(&mut ctx)?; + let slice = elements.as_slice::(); + let start = row * dim_usize; + Ok(slice[start..start + dim_usize].to_vec()) +} + +fn score_all_rows(data: &ArrayRef, query: &[f32], session: &VortexSession) -> Result> { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) + .vortex_expect("cosine similarity accepts matching Vector inputs") + .into_array(); + + let mut ctx = session.create_execution_ctx(); + let scores: PrimitiveArray = cosine.execute(&mut ctx)?; + Ok(scores.as_slice::().to_vec()) +} + +/// Return the indices of the top-K highest scores, stable-sorted descending. +fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { + let mut idx: Vec = (0..scores.len()).collect(); + idx.sort_by(|&a, &b| { + scores[b] + .partial_cmp(&scores[a]) + .unwrap_or(std::cmp::Ordering::Equal) + }); + idx.truncate(top_k); + idx +} + +#[cfg(test)] +mod tests { + use vortex_bench::SESSION; + + use super::*; + use crate::Variant; + use crate::prepare_variant; + use crate::test_utils::synthetic_vector; + + #[test] + fn uncompressed_has_perfect_self_recall() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10, &SESSION).unwrap(); + assert!( + (recall - 1.0).abs() < 1e-9, + "self-recall must be 1.0, got {recall}" + ); + } + + #[test] + fn turboquant_recall_is_reasonable_for_synthetic_data() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let prepared = crate::PreparedDataset { + name: "synthetic".to_string(), + uncompressed: uncompressed.clone(), + query: vec![], + parquet_bytes: 0, + }; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let (tq_array, _) = rt + 
.block_on(prepare_variant( + &prepared, + Variant::VortexTurboQuant, + &SESSION, + )) + .unwrap(); + + // With only 64 random rows, recall@10 won't be 1.0 but it should be well + // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch + // total regressions without being flaky on distribution noise. + let recall = measure_recall_at_k(&uncompressed, &tq_array, 4, 10, &SESSION).unwrap(); + assert!( + recall >= 0.3, + "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", + ); + } +} From d8a5b5b6e80e774a581be797bed47a4440bac35e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:06:56 +0000 Subject: [PATCH 06/18] ci: wire vector-search-bench into post-merge bench.yml matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `vector-search-bench` as a new entry in the post-merge `bench` matrix alongside `random-access-bench` and `compress-bench`. Uses the same build-bin / bench-taskset / gh-json / S3-upload pipeline so results flow into the existing benchmarks dashboard without any dashboard-side changes. The four formats passed on the CLI (`parquet,vortex-uncompressed,vortex-default,vortex-turboquant`) surface in the emitted JSON as three different `target.format` values — `Format::Parquet` for the hand-rolled baseline, `Format::OnDiskVortex` for the two lossless variants, and `Format::VortexTurboQuant` for the lossy one — so dashboards can render them as distinct bars per metric. Also promotes the `--variants` clap flag on the binary to `--formats` for naming consistency with the other standalone benchmarks (the CI script pipes `--formats ${{ matrix.benchmark.formats }}` uniformly). The underlying `Variant` enum is unchanged; only the CLI surface moved. Finally, adds `benchmarks/vector-search-bench/README.md` documenting what the benchmark measures, the four formats, local-run instructions, and — importantly — the dataset-mirror caveat. 
The current dataset URL points at `assets.zilliz.com` (Zilliz's public anonymous-readable bucket). Running this matrix entry on every develop merge will create recurring egress traffic on a third-party bucket; the README explains how to swap the URL in `VectorDataset::parquet_url` to an internal mirror before turning this on for a fork with heavy merge frequency. Skipping `bench-pr.yml` (PR runs) deliberately — v1 is develop-only until the mirror / egress question is resolved. Signed-off-by: Claude Signed-off-by: Connor Tsui --- .github/workflows/bench.yml | 4 ++ benchmarks/vector-search-bench/README.md | 81 ++++++++++++++++++++++ benchmarks/vector-search-bench/src/main.rs | 55 ++++++++++++--- 3 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 benchmarks/vector-search-bench/README.md diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7be829d80bd..f700aac0a78 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -48,6 +48,10 @@ jobs: name: Compression build_args: "--features lance" formats: "parquet,lance,vortex" + - id: vector-search-bench + name: Vector Similarity Search + build_args: "" + formats: "parquet,vortex-uncompressed,vortex-default,vortex-turboquant" steps: - uses: runs-on/action@v2 if: github.repository == 'vortex-data/vortex' diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md new file mode 100644 index 00000000000..7a10508910f --- /dev/null +++ b/benchmarks/vector-search-bench/README.md @@ -0,0 +1,81 @@ +# vector-search-bench + +Brute-force cosine-similarity benchmark for Vortex on public VectorDBBench +embedding corpora. + +## What it measures + +For each `(dataset, format)` pair, the benchmark records four numbers: + +1. **Size** — compressed storage footprint in bytes. For the Vortex variants + that round-trip through `.vortex` files today (uncompressed & BtrBlocks + default) this is the real on-disk size. 
For `vortex-turboquant` it is + the in-memory `.nbytes()` footprint, because the `L2Denorm` scalar-fn + array does not yet have a concrete `serialize_metadata` implementation. +2. **Full-scan decode time** — wall time to materialize the whole `Vector` + column into a `FixedSizeListArray`. +3. **Cosine-similarity execute time** — wall time for + `CosineSimilarity(data, const_query)` executed to a materialized f32 array. +4. **Cosine-filter execute time** — wall time for the full + `Binary(Gt, [CosineSimilarity, threshold])` expression tree executed to + a `BoolArray`. + +The TurboQuant variant additionally reports **Recall@10** against the +uncompressed Vortex scan as local ground truth. Lossless variants are trivially +1.0 so they are not re-measured. + +## Formats + +- `parquet` — Parquet file read via `parquet::arrow` into an Arrow + `FixedSizeListArray`, then a hand-rolled Rust cosine loop. This is the + "what you'd do without Vortex" external floor. +- `vortex-uncompressed` — Raw `Vector` extension array, no + encoding-level compression applied. +- `vortex-default` — `BtrBlocksCompressor::default()` applied to the FSL + storage child. Generic lossless Vortex compression for float vectors. +- `vortex-turboquant` — The full + `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` pipeline. + Lossy; recall@10 is reported alongside throughput. + +## Datasets + +The first dataset wired up is **Cohere-100K** (`cohere-small`): 100K rows × +768 dims, cosine metric, ~150 MB zstd-parquet. This is the smallest +VectorDBBench-supplied embedding corpus and sits comfortably inside a CI +time / bandwidth budget. + +The upstream URL is +`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. The +public Zilliz bucket is anonymous-readable so the code _can_ hit it directly. 
+ +## Running locally + +```bash +cargo run -p vector-search-bench --release -- \ + --datasets cohere-small \ + --formats parquet,vortex-uncompressed,vortex-default,vortex-turboquant \ + --iterations 5 \ + -d table +``` + +The first run downloads the parquet file into +`vortex-bench/data/cohere-small/cohere-small.parquet` and caches it +idempotently for subsequent runs. + +## CI note: dataset mirror + +CI runs after every develop-branch merge. Hitting `assets.zilliz.com` +from every merge would create recurring egress traffic on a third-party +bucket — the same courtesy reason `RPlace` / `AirQuality` are excluded +from CI in `compress-bench`. + +Before enabling the `vector-search-bench` entry in `.github/workflows/bench.yml` +on a fork, either: + +1. **Mirror the file into an internal bucket** and swap the URL in + `vortex-bench/src/vector_dataset.rs::VectorDataset::parquet_url`, or +2. **Accept the upstream egress cost** and leave the URL as-is. + +The mirror step is a one-off `aws s3 cp` and is documented here rather +than automated in the build because the destination bucket is +organization-specific. diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 5a5ef57a742..27096947317 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -54,14 +54,13 @@ struct Args { #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableDataset::CohereSmall])] datasets: Vec, - /// Subset of variants to exercise. Defaults to all three Vortex variants. - #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Variant::VortexUncompressed, Variant::VortexDefault, Variant::VortexTurboQuant])] - variants: Vec, - - /// Also run the Parquet-Arrow hand-rolled cosine baseline as an additional variant. - /// Default `true` — disable only when you intentionally want a Vortex-only comparison. 
- #[arg(long, default_value_t = true)] - parquet_baseline: bool, + /// Which benchmark variants to run, using kebab-cased labels. The `--formats` name is + /// used (instead of `--variants`) so this benchmark matches the CI invocation + /// convention shared across random-access-bench / compress-bench. Accepted values: + /// `parquet`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. Defaults + /// to running all four. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Parquet, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] + formats: Vec, /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables /// the quality measurement entirely (useful for smoke tests). @@ -103,6 +102,33 @@ impl SelectableDataset { } } +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] +enum SelectableFormat { + /// Parquet-Arrow hand-rolled cosine loop baseline. + #[clap(name = "parquet")] + Parquet, + /// Raw `Vector` with no encoding compression. + #[clap(name = "vortex-uncompressed")] + VortexUncompressed, + /// BtrBlocks default-compression applied to the FSL storage child. + #[clap(name = "vortex-default")] + VortexDefault, + /// Full TurboQuant pipeline (lossy). 
+ #[clap(name = "vortex-turboquant")] + VortexTurboQuant, +} + +impl SelectableFormat { + fn into_variant(self) -> Option { + match self { + SelectableFormat::Parquet => None, + SelectableFormat::VortexUncompressed => Some(Variant::VortexUncompressed), + SelectableFormat::VortexDefault => Some(Variant::VortexDefault), + SelectableFormat::VortexTurboQuant => Some(Variant::VortexTurboQuant), + } + } +} + #[tokio::main] async fn main() -> Result<()> { let args = Args::parse(); @@ -115,7 +141,14 @@ async fn main() -> Result<()> { .map(SelectableDataset::into_dataset) .collect(); - let total_work = datasets.len() * args.variants.len(); + let run_parquet_baseline = args.formats.contains(&SelectableFormat::Parquet); + let variants: Vec = args + .formats + .iter() + .filter_map(|f| f.into_variant()) + .collect(); + + let total_work = datasets.len() * args.formats.len(); let progress = ProgressBar::new(total_work as u64); let mut timings: Vec = Vec::new(); @@ -135,7 +168,7 @@ async fn main() -> Result<()> { // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex // variants. 
- if args.parquet_baseline { + if run_parquet_baseline { let parquet_path = dataset.to_parquet_path().await?; let baseline_timings = run_parquet_baseline_timings( &parquet_path, @@ -170,7 +203,7 @@ async fn main() -> Result<()> { }); } - for &variant in &args.variants { + for &variant in &variants { let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; let variant_label = variant.label(); From b286070eff4d06533c6ca927635273ba74e83386 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:15:00 +0000 Subject: [PATCH 07/18] vortex-bench: add Cohere-medium, OpenAI, Bioasq, Glove datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends `VectorDataset` with five additional cosine-metric corpora from the VectorDBBench collection: - CohereMedium: 1M × 768d (wiki-22-12) - OpenAiSmall: 50K × 1536d (OpenAI-on-C4) - OpenAiMedium: 500K × 1536d (OpenAI-on-C4) - BioasqMedium: 1M × 1024d (biomedical) - GloveMedium: 1M × 200d (word embeddings) All six built-in datasets are exported via a new `ALL_VECTOR_DATASETS` slice and exposed through the `vector-search-bench` `--datasets` flag so local runs can pick any of them. Adds an `all_datasets_have_consistent_metadata` test that verifies every built-in variant has a unique kebab-cased name, a `train.parquet` URL under `assets.zilliz.com/benchmark/`, a dimension above the TurboQuant minimum (128), and cosine metric (v1 only wires cosine). L2 / IP datasets (SIFT, GIST, LAION) remain out of scope until vortex-tensor gains an L2-distance scalar function. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/main.rs | 15 +++ vortex-bench/src/vector_dataset.rs | 106 ++++++++++++++++++++- 2 files changed, 116 insertions(+), 5 deletions(-) diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 27096947317..7d65e29598f 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -92,12 +92,27 @@ struct Args { enum SelectableDataset { #[clap(name = "cohere-small")] CohereSmall, + #[clap(name = "cohere-medium")] + CohereMedium, + #[clap(name = "openai-small")] + OpenAiSmall, + #[clap(name = "openai-medium")] + OpenAiMedium, + #[clap(name = "bioasq-medium")] + BioasqMedium, + #[clap(name = "glove-medium")] + GloveMedium, } impl SelectableDataset { fn into_dataset(self) -> VectorDataset { match self { SelectableDataset::CohereSmall => VectorDataset::CohereSmall, + SelectableDataset::CohereMedium => VectorDataset::CohereMedium, + SelectableDataset::OpenAiSmall => VectorDataset::OpenAiSmall, + SelectableDataset::OpenAiMedium => VectorDataset::OpenAiMedium, + SelectableDataset::BioasqMedium => VectorDataset::BioasqMedium, + SelectableDataset::GloveMedium => VectorDataset::GloveMedium, } } } diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs index d6a97d20ba9..86086ac57ac 100644 --- a/vortex-bench/src/vector_dataset.rs +++ b/vortex-bench/src/vector_dataset.rs @@ -37,30 +37,77 @@ use crate::idempotent_async; /// /// Each variant is one of the canonical VectorDBBench corpora, distributed as parquet under /// the Zilliz public benchmark bucket. The smaller `*Small` sizes are appropriate for CI -/// runs; the larger sizes are intended for local / on-demand experiments. +/// runs; the larger sizes are intended for local / on-demand experiments. 
Only +/// cosine-metric datasets are wired today — SIFT / GIST / LAION (L2) will follow when an +/// L2-distance scalar function lands in `vortex-tensor`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum VectorDataset { /// Cohere wiki-22-12, 100K rows × 768 dims, cosine metric. ~307 MB raw / ~150 MB /// zstd-parquet — the default CI-friendly size. CohereSmall, + /// Cohere wiki-22-12, 1M rows × 768 dims, cosine metric. ~3 GB raw / ~1.5 GB + /// zstd-parquet. Local-only by default. + CohereMedium, + /// OpenAI embeddings on C4, 50K rows × 1536 dims, cosine metric. ~307 MB raw — + /// the smallest OpenAI variant and comparable in size to Cohere-small, but with + /// double the dimensionality. + OpenAiSmall, + /// OpenAI embeddings on C4, 500K rows × 1536 dims, cosine metric. ~3 GB raw. + /// Local-only by default. + OpenAiMedium, + /// Bioasq biomedical embeddings, 1M rows × 1024 dims, cosine metric. ~4 GB raw. + /// Local-only by default. + BioasqMedium, + /// Glove word embeddings, 1M rows × 200 dims, cosine metric. ~800 MB raw. + GloveMedium, } +/// All built-in [`VectorDataset`] variants in a fixed order. Convenient for iterating or +/// for listing choices in CLI help. +pub const ALL_VECTOR_DATASETS: &[VectorDataset] = &[ + VectorDataset::CohereSmall, + VectorDataset::CohereMedium, + VectorDataset::OpenAiSmall, + VectorDataset::OpenAiMedium, + VectorDataset::BioasqMedium, + VectorDataset::GloveMedium, +]; + impl VectorDataset { /// The upstream URL for this dataset's canonical train-split parquet file. /// - /// **CI note**: point at an internal mirror before enabling this benchmark in CI. + /// **CI note**: point at an internal mirror before enabling this benchmark in CI — + /// see `benchmarks/vector-search-bench/README.md` for the procedure. 
pub fn parquet_url(&self) -> &'static str { match self { VectorDataset::CohereSmall => { "https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet" } + VectorDataset::CohereMedium => { + "https://assets.zilliz.com/benchmark/cohere_medium_1m/train.parquet" + } + VectorDataset::OpenAiSmall => { + "https://assets.zilliz.com/benchmark/openai_small_50k/train.parquet" + } + VectorDataset::OpenAiMedium => { + "https://assets.zilliz.com/benchmark/openai_medium_500k/train.parquet" + } + VectorDataset::BioasqMedium => { + "https://assets.zilliz.com/benchmark/bioasq_medium_1m/train.parquet" + } + VectorDataset::GloveMedium => { + "https://assets.zilliz.com/benchmark/glove_medium_1m/train.parquet" + } } } /// Fixed vector dimensionality for this dataset. pub fn dim(&self) -> u32 { match self { - VectorDataset::CohereSmall => 768, + VectorDataset::CohereSmall | VectorDataset::CohereMedium => 768, + VectorDataset::OpenAiSmall | VectorDataset::OpenAiMedium => 1536, + VectorDataset::BioasqMedium => 1024, + VectorDataset::GloveMedium => 200, } } @@ -68,14 +115,25 @@ impl VectorDataset { pub fn num_rows(&self) -> usize { match self { VectorDataset::CohereSmall => 100_000, + VectorDataset::CohereMedium => 1_000_000, + VectorDataset::OpenAiSmall => 50_000, + VectorDataset::OpenAiMedium => 500_000, + VectorDataset::BioasqMedium => 1_000_000, + VectorDataset::GloveMedium => 1_000_000, } } /// The distance metric the upstream dataset was curated for. v1 only wires cosine, so - /// this is informational today. + /// every built-in dataset returns [`VectorMetric::Cosine`]. The enum variant exists so + /// that L2 / inner-product datasets can be added later without a breaking change. 
pub fn metric(&self) -> VectorMetric { match self { - VectorDataset::CohereSmall => VectorMetric::Cosine, + VectorDataset::CohereSmall + | VectorDataset::CohereMedium + | VectorDataset::OpenAiSmall + | VectorDataset::OpenAiMedium + | VectorDataset::BioasqMedium + | VectorDataset::GloveMedium => VectorMetric::Cosine, } } } @@ -98,6 +156,11 @@ impl Dataset for VectorDataset { fn name(&self) -> &str { match self { VectorDataset::CohereSmall => "cohere-small", + VectorDataset::CohereMedium => "cohere-medium", + VectorDataset::OpenAiSmall => "openai-small", + VectorDataset::OpenAiMedium => "openai-medium", + VectorDataset::BioasqMedium => "bioasq-medium", + VectorDataset::GloveMedium => "glove-medium", } } @@ -142,6 +205,9 @@ impl Dataset for VectorDataset { #[cfg(test)] mod tests { + use vortex::utils::aliases::hash_set::HashSet; + + use super::ALL_VECTOR_DATASETS; use super::VectorDataset; use super::VectorMetric; use crate::datasets::Dataset; @@ -156,4 +222,34 @@ mod tests { assert!(ds.parquet_url().ends_with("/train.parquet")); assert!(ds.parquet_url().contains("cohere_small_100k")); } + + #[test] + fn all_datasets_have_consistent_metadata() { + // Every built-in dataset must have a unique kebab-cased name, point at a + // `train.parquet` file under `assets.zilliz.com/benchmark/`, declare a + // dimension ≥ `MIN_DIMENSION` for TurboQuant, a non-zero row count, and + // (for v1) cosine metric. 
+ let mut seen_names: HashSet = HashSet::default(); + for &ds in ALL_VECTOR_DATASETS { + let name = ds.name(); + assert!( + seen_names.insert(name.to_string()), + "duplicate dataset name {name}", + ); + assert!( + ds.dim() >= 128, + "{name} dim {} below TurboQuant minimum", + ds.dim() + ); + assert!(ds.num_rows() > 0, "{name} has zero rows"); + assert_eq!( + ds.metric(), + VectorMetric::Cosine, + "{name} must be cosine for v1" + ); + let url = ds.parquet_url(); + assert!(url.starts_with("https://assets.zilliz.com/benchmark/")); + assert!(url.ends_with("/train.parquet")); + } + } } From 616acdcba688fc1ccf9b0391072cbdd31fb556b3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:27:06 +0000 Subject: [PATCH 08/18] vector-search-bench: handle ListView, canonicalize chunks, add synth generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes two issues discovered while running the benchmark end-to-end against a real parquet file in a no-network environment: 1. `parquet_to_vortex_chunks` produces `ListView` arrays for list columns by default, so `list_to_vector_ext` was rejecting them with "expects a List array, got list(f32)". Added a fast path that applies `recursive_list_from_list_view` before the offset-stride validation kicks in. 2. `list_to_vector_ext` returns a `ChunkedArray>` when given a chunked input (the normal shape after parquet ingest), but downstream `extract_query_row` expects a single non-chunked `Extension`. `prepare_dataset` now calls `wrapped.execute::(&mut ctx)` to materialize the chunked wrapper into a single unchunked extension array. Also adds a new `gen_synthetic_dataset` helper binary that writes a VectorDBBench-shape parquet file (`id: int64`, `emb: list`, zstd-compressed) with deterministic xorshift-generated vectors. This is useful for: - Local dev runs of `vector-search-bench` without needing outbound network access to `assets.zilliz.com`. 
- Populating the `vortex-bench/data//.parquet` cache path so the benchmark's idempotent download step skips the HTTP fetch. - Sanity-checking the benchmark pipeline in CI environments that block outbound HTTPS to third-party buckets. Verified end-to-end against 5000×768 synthetic Cohere-small and 5000×1536 synthetic OpenAI-small inputs — every measurement pipeline (size, decode, cosine, filter, recall@10) produces valid gh-json and TurboQuant recall@10 lands at 0.91 on both fixtures, consistent with the loose recall floor the unit test enforces. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 1 + benchmarks/vector-search-bench/Cargo.toml | 1 + .../src/bin/gen_synthetic_dataset.rs | 139 ++++++++++++++++++ benchmarks/vector-search-bench/src/lib.rs | 11 +- vortex-bench/src/conversions.rs | 9 ++ 5 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs diff --git a/Cargo.lock b/Cargo.lock index c56f8e5330e..ebc8530825a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10058,6 +10058,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arrow-array 58.0.0", + "arrow-buffer 58.0.0", "arrow-schema 58.0.0", "async-trait", "clap", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index 068cf400d71..8e8101ddcee 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -17,6 +17,7 @@ publish = false [dependencies] anyhow = { workspace = true } arrow-array = { workspace = true } +arrow-buffer = { workspace = true } arrow-schema = { workspace = true } async-trait = { workspace = true } clap = { workspace = true, features = ["derive"] } diff --git a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs new file mode 100644 index 00000000000..3f5500edfcf --- /dev/null +++ 
b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Write a synthetic parquet file matching the VectorDBBench `emb: list` + `id: int64` +//! schema. Useful for local dev runs of `vector-search-bench` without needing network +//! access to `assets.zilliz.com`, and for sandbox / CI environments that block outbound +//! HTTPS. +//! +//! The generated file is bit-identical across runs for a given `(num_rows, dim, seed)` +//! triple so that downstream benchmark output is reproducible. +//! +//! Example: +//! +//! ```bash +//! cargo run -p vector-search-bench --bin gen-synthetic-dataset --release -- \ +//! --num-rows 5000 \ +//! --dim 768 \ +//! --out vortex-bench/data/cohere-small/cohere-small.parquet +//! ``` +//! +//! After running this, `vector-search-bench --datasets cohere-small` will find the +//! cached parquet file and skip the HTTP download via `idempotent_async`. + +use std::fs::File; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use anyhow::Result; +use arrow_array::Int64Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::builder::Float32Builder; +use arrow_array::builder::Int32BufferBuilder; +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::Schema; +use clap::Parser; +use parquet::arrow::ArrowWriter; +use parquet::basic::Compression; +use parquet::file::properties::WriterProperties; + +#[derive(Parser, Debug)] +#[command( + version, + about = "Generate a synthetic VectorDBBench-style parquet file" +)] +struct Args { + /// Number of rows to generate. + #[arg(long, default_value_t = 5000)] + num_rows: usize, + + /// Vector dimensionality. Must be ≥ 128 to exercise TurboQuant. + #[arg(long, default_value_t = 768)] + dim: u32, + + /// Deterministic PRNG seed — changing this changes the generated vectors. 
+ #[arg(long, default_value_t = 0xC0FFEE)] + seed: u64, + + /// Output parquet file path. Parent directory is created if missing. + #[arg(long)] + out: PathBuf, +} + +fn main() -> Result<()> { + let args = Args::parse(); + + if let Some(parent) = args.out.parent() { + std::fs::create_dir_all(parent)?; + } + + // Build an Arrow `ListArray` so the schema matches VectorDBBench's `emb: + // list` (note: NOT fixed_size_list — parquet has no FSL logical type so + // arrow-rs writes lists). Every list has exactly `dim` elements. + let dim_usize = args.dim as usize; + let total_elements = args.num_rows * dim_usize; + + let mut float_values = Float32Builder::with_capacity(total_elements); + let mut offsets = Int32BufferBuilder::new(args.num_rows + 1); + offsets.append(0i32); + + let mut state = args.seed.wrapping_add(1); + for row in 0..args.num_rows { + for i in 0..dim_usize { + // Deterministic xorshift mixed with position so every vector is distinct. + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + let scale = 1.0f32 / 32768.0; + let v = ((state & 0xFFFF) as f32 * scale - 0.5) + + ((row as f32 * 0.00013) + (i as f32 * 0.00007)).sin() * 0.25; + float_values.append_value(v); + } + let written = i32::try_from((row + 1) * dim_usize) + .context("offset overflows i32 — reduce num_rows or dim")?; + offsets.append(written); + } + + let values_array = float_values.finish(); + let offsets_buffer = offsets.finish(); + + let field = Arc::new(Field::new("item", DataType::Float32, false)); + let list_dtype = DataType::List(Arc::clone(&field)); + let list_array = ListArray::try_new( + Arc::clone(&field), + arrow_buffer::OffsetBuffer::new(offsets_buffer.into()), + Arc::new(values_array), + None, + )?; + + let ids: Int64Array = (0..args.num_rows as i64).collect(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("emb", list_dtype, false), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + 
vec![Arc::new(ids), Arc::new(list_array)], + )?; + + let writer_props = WriterProperties::builder() + .set_compression(Compression::ZSTD(Default::default())) + .build(); + let file = File::create(&args.out)?; + let mut writer = ArrowWriter::try_new(file, schema, Some(writer_props))?; + writer.write(&batch)?; + writer.close()?; + + println!( + "wrote {} rows × {} dims to {}", + args.num_rows, + args.dim, + args.out.display() + ); + Ok(()) +} diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 27f61f39ed7..39a780875d1 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -152,6 +152,8 @@ impl PreparedDataset { /// Prepare a dataset by downloading its parquet file, converting the `emb` column to a /// `Vector` extension array, and extracting a single-row query vector. pub async fn prepare_dataset(dataset: &VectorDataset) -> Result { + use vortex::array::arrays::ExtensionArray; + let parquet_path = dataset .to_parquet_path() .await @@ -171,7 +173,14 @@ pub async fn prepare_dataset(dataset: &VectorDataset) -> Result let struct_array = chunked.into_array(); let emb_column = extract_emb_column(&struct_array)?; - let uncompressed = list_to_vector_ext(emb_column)?; + let wrapped = list_to_vector_ext(emb_column)?; + + // `list_to_vector_ext` may return a chunked `Extension` when the source was + // a `ChunkedArray` of list columns (the usual shape after `parquet_to_vortex_chunks`). + // Materialize it into a single non-chunked `ExtensionArray` so downstream code can + // treat it uniformly. 
+ let mut ctx = SESSION.create_execution_ctx(); + let uncompressed = wrapped.execute::(&mut ctx)?.into_array(); let query = extract_query_row(&uncompressed, DEFAULT_QUERY_ROW)?; diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 13e295e13c4..e31c3520d2f 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -26,8 +26,10 @@ use vortex::array::arrays::ChunkedArray; use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::List; +use vortex::array::arrays::ListView; use vortex::array::arrays::chunked::ChunkedArrayExt; use vortex::array::arrays::list::ListArrayExt; +use vortex::array::arrays::listview::recursive_list_from_list_view; use vortex::array::arrow::FromArrowArray; use vortex::array::builders::builder_with_capacity; use vortex::array::extension::EmptyMetadata; @@ -267,6 +269,13 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { return Ok(ChunkedArray::from_iter(converted).into_array()); } + // `parquet_to_vortex_chunks` produces `ListView` arrays for list columns by default; + // materialize them into a flat `List` representation before we validate offsets. + if input.as_opt::().is_some() { + let flat = recursive_list_from_list_view(input)?; + return list_to_vector_ext(flat); + } + let Some(list) = input.as_opt::() else { vortex_bail!( "list_to_vector_ext expects a List array, got dtype {}", From a12f0d054950c73dc8941d5d6df485ff65520615 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 14:07:12 +0000 Subject: [PATCH 09/18] vector-search-bench: verify correctness + fix misleading size / decompress measurements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A thorough audit of the benchmark's outputs found three issues that together made the results hard to interpret: 1. 
**"Size" was lying.** The old `measure_on_disk_size` helper wrote every variant through `SESSION.write_options()`, which uses `WriteStrategyBuilder::default()` — and that default applies BtrBlocks compression regardless of the in-memory tree. So both `vortex-uncompressed` and `vortex-default` ended up writing identical ALP-RD-compressed bytes to disk, reporting the same "size" number even though the two in-memory trees were different. Switched to reporting `ArrayRef::nbytes()` of the in-memory variant tree instead — the honest footprint of what the compute measurements actually operate on. Running the diagnostic `print_variant_trees` test now shows what each variant really is: ``` VortexUncompressed nbytes=15,360,000 Extension → FSL → Primitive VortexDefault nbytes=13,072,050 Extension → FSL → alprd(f32) [bitpacked u16/u32 + patches] VortexTurboQuant nbytes= 5,141,024 L2Denorm → SorfTransform → FSL → Dict(u8 codes + f32[256]) ``` So BtrBlocks really does find a ~15% lossless saving on random f32 via ALP-RD, and the previously-observed 3.5× cosine slowdown is the real cost of decoding ALP-RD on the fly inside `CosineSimilarity`. 2. **"Decompress time" for the compressed variants was a no-op.** The old `decode_full_scan` only executed the Extension shell and the FSL storage; it never forced the element buffer to materialize. Since FSL canonical form can keep its elements in their encoded representation, this reported ~60 ns for `vortex-default` even though reading any value would actually have to run the ALP-RD unpacking. Renamed to `decompress_full_scan` and added a final `fsl.elements().execute::()` call that forces the lazy decode to completion. `vortex-default`'s decompress time now shows ~6 ms on 5000×768 rows — the actual ALP-RD cost. 3. **No correctness verification.** The benchmark reported throughput numbers with no guarantee that the variants were producing the same answers. 
A new `verify.rs` module now computes cosine-similarity scores for one query against every variant *before* timing, compares them to the uncompressed baseline, and bails the run if any lossless variant drifts beyond 1e-4 max-abs-diff or any lossy variant drifts beyond 0.2 max-abs-diff. The `execute_cosine` implementation is promoted to `pub` and shared between the timing loop and verification so both paths exercise the exact same expression tree. Also adds a new `compress time` measurement alongside `decompress time`, so the dashboard can show the full round-trip cost per variant (previously we only timed one side). The TurboQuant verification is emitted as a warning instead of a hard error — the lossy tolerance is a quality-budget alert, not a correctness invariant. New unit tests (13 total in vector-search-bench, up from 4): - `verify::*` — 8 tests covering the `compare_scores`, `verify_scores`, and `verify_variant` paths including NaN handling and end-to-end lossless/lossy checks on synthetic 64×128 data. - `tests::uncompressed_decompress_is_fast` — regression guard ensuring the uncompressed variant's decompress pass is >5× faster than TurboQuant's so future refactors can't accidentally make the "uncompressed" path take the slow path. - `tests::print_variant_trees` — `#[ignore]` diagnostic test that dumps `display_tree_encodings_only()` for each variant. This is what was used to root-cause the size-lying issue. Verified end-to-end on synthetic 5000×768 Cohere-small: parquet nbytes=14,511,306 cosine= 2.9 ms correctness=0.00e0 vortex-uncompressed nbytes=15,360,000 cosine= 7.9 ms correctness=0.00e0 vortex-default nbytes=13,072,050 cosine=20.5 ms correctness=0.00e0 vortex-turboquant nbytes= 5,141,024 cosine=59.1 ms correctness=5.18e-3 recall@10 (tq) 0.9150 All lossless variants now produce bit-identical cosine scores (max diff exactly 0.0) against the uncompressed baseline, confirming the ALP-RD and hand-rolled parquet paths are correct. 
TurboQuant's 5.18e-3 drift is well within the 0.2 lossy tolerance. Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 281 +++++++++----- benchmarks/vector-search-bench/src/main.rs | 127 ++++++- .../src/parquet_baseline.rs | 6 +- benchmarks/vector-search-bench/src/recall.rs | 36 +- benchmarks/vector-search-bench/src/verify.rs | 344 ++++++++++++++++++ 5 files changed, 667 insertions(+), 127 deletions(-) create mode 100644 benchmarks/vector-search-bench/src/verify.rs diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 39a780875d1..fe27400dce6 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -3,27 +3,38 @@ //! Vector similarity-search benchmark core. //! -//! This crate measures four quantities for each `(dataset, variant)` pair: +//! For each `(dataset, variant)` pair we report: //! -//! 1. **Compressed storage size** (bytes on disk, or in-memory `.nbytes()` for variants that -//! don't yet serialize — currently just [`Variant::VortexTurboQuant`]). -//! 2. **Full-scan decode time** — executing the `Vector` column into a -//! materialized [`vortex::array::arrays::FixedSizeListArray`]. -//! 3. **Cosine-similarity execute time** — executing -//! `CosineSimilarity(data, const_query)` into a materialized f32 primitive array. -//! 4. **Filter execute time** — executing -//! `Binary(Gt, [CosineSimilarity, threshold])` into a -//! [`vortex::array::arrays::BoolArray`]. +//! - **In-memory size** — `ArrayRef::nbytes()` of the prepared variant tree. This is the +//! memory footprint you'd pay to keep that encoding resident. +//! - **Compress time** — the wall time to build the variant tree from the materialized +//! uncompressed source (0 for the uncompressed variant itself, the BtrBlocks pass for +//! `vortex-default`, the full L2Denorm+SORF+quantize pipeline for `vortex-turboquant`). +//! 
- **Decompress time** — the wall time to execute the variant tree back into a +//! canonical `FixedSizeListArray` (≈0 for the already-canonical uncompressed variant, +//! meaningful for the compressed variants). +//! - **Cosine time** — executing `CosineSimilarity(data, const_query)` to a materialized +//! f32 primitive array. +//! - **Filter time** — executing `Binary(Gt, [cosine, threshold])` to a `BoolArray`. +//! - **Recall@10** (for the lossy TurboQuant variant only) against exact top-10 from the +//! uncompressed variant. //! -//! Measurements are emitted via the existing `vortex_bench::measurements` types so that -//! the benchmark results flow through the standard `gh-json` pipeline and appear in the -//! CI dashboard alongside compress-bench / random-access-bench results. +//! Before any timing begins, the benchmark also runs a **correctness verification** pass +//! via [`verify`]: for every variant it computes cosine scores for a single query and +//! compares them to the ground-truth scores from the uncompressed variant. Lossless +//! variants must match within [`verify::LOSSLESS_TOLERANCE`]; lossy variants must match +//! within [`verify::LOSSY_TOLERANCE`]. A correctness failure bails the run. +//! +//! Measurements are emitted via the existing `vortex_bench::measurements` types so +//! results flow through the standard `gh-json` pipeline and show up on the CI dashboard +//! alongside compress-bench / random-access-bench. use std::time::Duration; use std::time::Instant; pub mod parquet_baseline; pub mod recall; +pub mod verify; use anyhow::Context; use anyhow::Result; @@ -260,67 +271,82 @@ fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { Ok(slice[start..start + dim_usize].to_vec()) } -/// Apply a `Variant`'s preparation strategy to the uncompressed Vortex array and return the -/// prepared array together with its reported size in bytes. 
For serializable variants the -/// size is the number of bytes written to a `.vortex` file; for in-memory-only variants -/// (TurboQuant) it's the live `.nbytes()` footprint. -pub async fn prepare_variant( +/// A prepared variant: the in-memory array tree plus the metadata we want to report +/// alongside it (size and construction cost). +#[derive(Debug, Clone)] +pub struct PreparedVariant { + /// The variant's in-memory array tree. For the uncompressed variant this is the same + /// canonical `Extension` pulled out of `prepare_dataset`; for the others it's + /// the output of the respective compression pipeline. + pub array: ArrayRef, + /// Summed byte footprint of the variant tree — `ArrayRef::nbytes()`. This is the + /// in-memory cost of keeping the variant resident, not a disk size. + pub nbytes: u64, + /// Wall time spent constructing the variant tree from the already-materialized + /// uncompressed source. 0 for [`Variant::VortexUncompressed`]; meaningful for the + /// compressed variants. + pub compress_duration: Duration, +} + +/// Apply a `Variant`'s preparation strategy to the materialized uncompressed source and +/// return the resulting tree together with its reported in-memory size and construction +/// time. +/// +/// **Why nbytes instead of on-disk size?** The Vortex file writer applies BtrBlocks +/// compression as part of its default write strategy regardless of the in-memory tree +/// shape, so serializing an "uncompressed" tree and measuring the resulting `.vortex` +/// file produces the same bytes as serializing a `BtrBlocksCompressor::default()`- +/// compressed tree — the disk-size comparison collapses two conceptually different +/// things into one number. Reporting `nbytes()` of the in-memory tree keeps the size +/// measurement consistent with what the *compute* measurements operate on. 
+pub fn prepare_variant( prepared: &PreparedDataset, variant: Variant, session: &VortexSession, -) -> Result<(ArrayRef, u64)> { +) -> Result { match variant { Variant::VortexUncompressed => { + // Identity: the uncompressed Extension is already materialized. Still + // record a dummy Instant so the timing point has a well-defined value even + // if it's effectively zero. + let start = Instant::now(); let array = prepared.uncompressed.clone(); - let size = - measure_on_disk_size(&array, session, &prepared.name, "uncompressed").await?; - Ok((array, size)) + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) } Variant::VortexDefault => { + let start = Instant::now(); let array = BtrBlocksCompressor::default().compress(&prepared.uncompressed)?; - let size = measure_on_disk_size(&array, session, &prepared.name, "default").await?; - Ok((array, size)) + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) } Variant::VortexTurboQuant => { let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; - // TurboQuant cannot yet round-trip through a Vortex file (L2Denorm metadata - // serialization is not implemented). Report the in-memory `.nbytes()` footprint - // as a proxy. Document this in the benchmark output so consumers of the - // dashboard aren't misled. - let size = array.nbytes() as u64; - Ok((array, size)) + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) } } } -/// Serialize a prepared Vortex array to a temporary `.vortex` file and return its length. 
-/// This is what we report as the "compressed size" for serializable variants; it matches -/// the semantics of `compress-bench` which reports the on-disk parquet/vortex file size. -async fn measure_on_disk_size( - array: &ArrayRef, - session: &VortexSession, - dataset_name: &str, - variant_label: &str, -) -> Result { - use vortex::file::WriteOptionsSessionExt; - - let tmp_dir = std::env::temp_dir().join("vortex-vector-search-bench"); - tokio::fs::create_dir_all(&tmp_dir).await?; - let tmp_path = tmp_dir.join(format!("{dataset_name}-{variant_label}.vortex")); - - let mut file = tokio::fs::File::create(&tmp_path).await?; - session - .write_options() - .write(&mut file, array.clone().to_array_stream()) - .await?; - - let metadata = tokio::fs::metadata(&tmp_path).await?; - Ok(metadata.len()) -} - -/// Run the decode / cosine / filter microbenchmarks against a prepared variant array and -/// return the best-of-`iterations` wall times for each measurement. +/// Run the decompress / cosine / filter microbenchmarks against a prepared variant +/// array and return the best-of-`iterations` wall times for each measurement. 
pub fn run_timings( variant_array: &ArrayRef, query: &[f32], @@ -329,15 +355,15 @@ pub fn run_timings( ) -> Result { let _ = QueryLen::default; // touch the type alias so rustc doesn't warn - let mut decode = Duration::MAX; + let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; for _ in 0..iterations { let mut ctx = session.create_execution_ctx(); let start = Instant::now(); - let decoded: FixedSizeListArray = decode_full_scan(variant_array, &mut ctx)?; - decode = decode.min(start.elapsed()); + let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; + decompress = decompress.min(start.elapsed()); drop(decoded); } @@ -358,7 +384,7 @@ pub fn run_timings( } Ok(VariantTimings { - decode, + decompress, cosine, filter, }) @@ -367,33 +393,54 @@ pub fn run_timings( /// Timing summary for one `(dataset, variant)` pair. #[derive(Debug, Clone, Copy)] pub struct VariantTimings { - /// Wall time for a full column decode. - pub decode: Duration, - /// Wall time for the cosine_similarity scalar-function execution. + /// Wall time to execute the variant's array tree back into a canonical + /// `FixedSizeListArray`. ~0 for [`Variant::VortexUncompressed`] (the tree is already + /// canonical), meaningful for the two compressed variants. + pub decompress: Duration, + /// Wall time for the cosine_similarity scalar-function execution over the whole + /// column (materialized into an `f32` [`PrimitiveArray`]). pub cosine: Duration, - /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression. + /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression executed + /// into a [`BoolArray`]. pub filter: Duration, } /// Fully materialize the input column so the measurement captures *all* decompression -/// work — the extension shell, the FSL storage, and the inner element buffer. +/// work — the extension shell, the FSL storage, **and the inner f32 element buffer**. 
+/// +/// Forcing the element buffer to materialize as a canonical `PrimitiveArray` is +/// what distinguishes this from a no-op cache hit. Executing the `ExtensionArray` or +/// `FixedSizeListArray` alone only unwraps the container shells — if the FSL's +/// `elements` child is (e.g.) an `alprd` tree, the bit-unpacking is lazy and only +/// happens when something reads the values. The `execute::` call below +/// forces that read. /// /// For the Vortex-uncompressed variant this is cheap (bitwise copy / no-op). For -/// BtrBlocks-default it includes FSL decompression. For TurboQuant it includes running -/// the inverse SORF rotation + dictionary lookup through the scalar-fn pipeline. -fn decode_full_scan( +/// BtrBlocks-default it includes the ALP-RD decoding pass. For TurboQuant it includes +/// running the inverse SORF rotation + dictionary lookup through the scalar-fn +/// pipeline. +pub fn decompress_full_scan( array: &ArrayRef, ctx: &mut vortex::array::ExecutionCtx, ) -> Result { use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; let ext: ExtensionArray = array.clone().execute(ctx)?; let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; + // Force the element buffer all the way down to a canonical PrimitiveArray so the + // timing captures any lazy decode work (ALP-RD bit unpacking, dict lookups, SORF + // inverse rotation, etc.). + let elements: PrimitiveArray = fsl.elements().clone().execute(ctx)?; + drop(elements); Ok(fsl) } -fn execute_cosine( +/// Execute `CosineSimilarity(data, broadcast(query))` to a materialized `f32` +/// [`PrimitiveArray`]. Shared between the timing loop and the correctness-verification +/// path so both exercise the exact same expression tree. 
+pub fn execute_cosine( data: &ArrayRef, query: &[f32], ctx: &mut vortex::array::ExecutionCtx, @@ -494,24 +541,96 @@ mod tests { Variant::VortexDefault, Variant::VortexTurboQuant, ] { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let (array, size) = rt - .block_on(prepare_variant(&prepared, variant, &SESSION)) - .unwrap(); + let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); assert_eq!( - array.len(), + prep.array.len(), num_rows, "variant {variant:?} changed row count" ); - assert!(size > 0, "variant {variant:?} reported zero size"); + assert!(prep.nbytes > 0, "variant {variant:?} reported zero size"); - let timings = run_timings(&array, &prepared.query, 2, &SESSION).unwrap(); - assert!(timings.decode > Duration::ZERO); + let timings = run_timings(&prep.array, &prepared.query, 2, &SESSION).unwrap(); + // TurboQuant + default must do real work; uncompressed's decompress is a + // no-op and can plausibly time as zero. assert!(timings.cosine > Duration::ZERO); assert!(timings.filter > Duration::ZERO); } } + + /// The **uncompressed** variant's decompress pass must be a no-op (the tree is + /// already canonical), while TurboQuant must do real work. This is a regression + /// guard for a future change accidentally making the uncompressed variant take the + /// slow path. 
+ #[test] + fn uncompressed_decompress_is_fast() { + let dim = 128u32; + let num_rows = 256usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xDEADBEEF); + + let prepared = PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query: vec![0.1f32; dim as usize], + parquet_bytes: 0, + }; + + let uncompressed_prep = + prepare_variant(&prepared, Variant::VortexUncompressed, &SESSION).unwrap(); + let turboquant_prep = + prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + + let unc_timings = + run_timings(&uncompressed_prep.array, &prepared.query, 3, &SESSION).unwrap(); + let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3, &SESSION).unwrap(); + + // The uncompressed decompress should be at least an order of magnitude faster + // than TurboQuant's (usually many orders of magnitude). 5x is a loose lower + // bound that won't flake on a noisy CI runner. + assert!( + tq_timings.decompress > unc_timings.decompress * 5, + "expected TurboQuant decompress ({:?}) to be >5x uncompressed ({:?})", + tq_timings.decompress, + unc_timings.decompress + ); + } + + /// Diagnostic: print the in-memory tree shape for each variant so we can see + /// exactly what BtrBlocks and TurboQuant do to the FSL storage. 
+ /// + /// Run with: + /// ```bash + /// cargo test -p vector-search-bench --release -- \ + /// --ignored --nocapture print_variant_trees + /// ``` + #[test] + #[ignore] + #[expect(clippy::use_debug, reason = "human-readable diagnostic output")] + fn print_variant_trees() { + let dim = 768u32; + let num_rows = 500usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let prepared = PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query: vec![0.1f32; dim as usize], + parquet_bytes: 0, + }; + + for variant in [ + Variant::VortexUncompressed, + Variant::VortexDefault, + Variant::VortexTurboQuant, + ] { + let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); + println!("=== {variant:?} ==="); + println!(" len : {}", prep.array.len()); + println!(" nbytes : {}", prep.nbytes); + println!(" compress_duration: {:?}", prep.compress_duration); + println!( + " encoding tree : {}", + prep.array.display_tree_encodings_only() + ); + } + } } diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 7d65e29598f..1e8b0643bc7 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -17,6 +17,7 @@ use std::borrow::Cow; use std::path::PathBuf; +use anyhow::Context; use anyhow::Result; use clap::Parser; use indicatif::ProgressBar; @@ -28,6 +29,9 @@ use vector_search_bench::prepare_variant; use vector_search_bench::recall::DEFAULT_TOP_K; use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; +use vector_search_bench::verify::VerificationKind; +use vector_search_bench::verify::compute_cosine_scores; +use vector_search_bench::verify::verify_variant; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_bench::create_output_writer; @@ -168,8 +172,8 @@ async fn main() -> Result<()> { let mut timings: Vec = Vec::new(); let mut sizes: Vec = Vec::new(); - let mut recalls: Vec = Vec::new(); + let 
mut verification: Vec = Vec::new(); for dataset in &datasets { let prepared = prepare_dataset(dataset).await?; @@ -180,11 +184,62 @@ async fn main() -> Result<()> { prepared.num_rows() ); + // Ground-truth cosine scores for the verification query — the scores produced by + // the uncompressed Vortex scan. Every other variant (including the parquet + // hand-rolled loop) will be compared against this. + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION) + .context("compute ground-truth cosine scores for verification")?; + tracing::info!( + "computed {} ground-truth cosine scores for {}", + baseline_scores.len(), + prepared.name + ); + // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex - // variants. + // variants. The parquet baseline uses a hand-rolled Rust cosine loop; it must + // match the Vortex cosine scores within lossless tolerance (f32 ULPs) because + // it's computing the same math on the same underlying f32 values. 
if run_parquet_baseline { let parquet_path = dataset.to_parquet_path().await?; + let baseline_data = + vector_search_bench::parquet_baseline::read_parquet_embedding_column(&parquet_path) + .context("read parquet emb column for verification")?; + let parquet_scores = vector_search_bench::parquet_baseline::cosine_loop( + &baseline_data.elements, + baseline_data.num_rows, + baseline_data.dim, + &prepared.query, + ); + let parquet_report = vector_search_bench::verify::verify_scores( + &baseline_scores, + &parquet_scores, + VerificationKind::Lossless, + ); + if !parquet_report.passed { + anyhow::bail!( + "parquet baseline correctness check failed on {}: \ + max_abs_diff={:.6}, mean_abs_diff={:.6}, tolerance={:.6}", + prepared.name, + parquet_report.max_abs_diff, + parquet_report.mean_abs_diff, + parquet_report.tolerance(), + ); + } + tracing::info!( + "parquet/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + prepared.name, + parquet_report.max_abs_diff, + parquet_report.mean_abs_diff, + ); + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/parquet/{}", prepared.name), + format: Format::Parquet, + unit: Cow::from("abs-diff"), + value: parquet_report.max_abs_diff, + }); + let baseline_timings = run_parquet_baseline_timings( &parquet_path, &prepared.query, @@ -202,9 +257,9 @@ async fn main() -> Result<()> { value: prepared.parquet_bytes as f64, }); timings.push(CompressionTimingMeasurement { - name: format!("decode time/{bench_name}"), + name: format!("decompress time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.decode, + time: baseline_timings.decompress, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-similarity time/{bench_name}"), @@ -219,25 +274,67 @@ async fn main() -> Result<()> { } for &variant in &variants { - let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; + let prep = prepare_variant(&prepared, variant, &SESSION)?; let variant_label = 
variant.label(); let bench_name = format!("{variant_label}/{}", prepared.name); + // Correctness verification BEFORE timing. Lossless variants must match + // the uncompressed baseline within f32 noise; TurboQuant must stay within + // its lossy tolerance. A failure bails the whole run — you cannot publish + // throughput numbers for an encoding that returns wrong answers. + let kind = if variant == Variant::VortexTurboQuant { + VerificationKind::Lossy + } else { + VerificationKind::Lossless + }; + let report = verify_variant( + &bench_name, + &prep.array, + &prepared.query, + &baseline_scores, + kind, + &SESSION, + )?; + tracing::info!( + "{} verification ({:?}): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, + kind, + report.max_abs_diff, + report.mean_abs_diff, + ); + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/{bench_name}"), + format: variant.as_format(), + unit: Cow::from("abs-diff"), + value: report.max_abs_diff, + }); + + // In-memory nbytes — the honest size of the variant tree we're executing. sizes.push(CustomUnitMeasurement { - name: format!("{variant_label} size/{}", prepared.name), + name: format!("{variant_label} nbytes/{}", prepared.name), format: variant.as_format(), unit: Cow::from("bytes"), - value: size_bytes as f64, + value: prep.nbytes as f64, + }); + + // Compress time — the wall time it takes to build the variant tree from + // the materialized uncompressed source. For the uncompressed variant + // itself this is ~0 (identity), so we still emit it as a measurement for + // dashboard consistency. 
+ timings.push(CompressionTimingMeasurement { + name: format!("compress time/{bench_name}"), + format: variant.as_format(), + time: prep.compress_duration, }); let variant_timings = - run_timings(&variant_array, &prepared.query, args.iterations, &SESSION)?; + run_timings(&prep.array, &prepared.query, args.iterations, &SESSION)?; timings.push(CompressionTimingMeasurement { - name: format!("decode time/{bench_name}"), + name: format!("decompress time/{bench_name}"), format: variant.as_format(), - time: variant_timings.decode, + time: variant_timings.decompress, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-similarity time/{bench_name}"), @@ -251,12 +348,12 @@ async fn main() -> Result<()> { }); // Recall@K quality measurement for lossy variants only. The lossless - // variants (uncompressed + BtrBlocks default) are trivially 1.0 against - // the uncompressed ground truth, so we skip them to avoid noise. + // variants are trivially 1.0 by construction (since they agree with the + // uncompressed baseline within 1e-4) so we skip them to keep noise down. 
if args.recall_queries > 0 && variant == Variant::VortexTurboQuant { let recall = measure_recall_at_k( &prepared.uncompressed, - &variant_array, + &prep.array, args.recall_queries, args.recall_k, &SESSION, @@ -295,11 +392,15 @@ async fn main() -> Result<()> { recall.name, recall.value, recall.unit )?; } + for check in &verification { + writeln!(writer, "{} {:.6e} {}", check.name, check.value, check.unit)?; + } } DisplayFormat::GhJson => { print_measurements_json(&mut writer, timings)?; print_measurements_json(&mut writer, sizes)?; print_measurements_json(&mut writer, recalls)?; + print_measurements_json(&mut writer, verification)?; } } diff --git a/benchmarks/vector-search-bench/src/parquet_baseline.rs b/benchmarks/vector-search-bench/src/parquet_baseline.rs index b80a2308370..222a125f362 100644 --- a/benchmarks/vector-search-bench/src/parquet_baseline.rs +++ b/benchmarks/vector-search-bench/src/parquet_baseline.rs @@ -172,14 +172,14 @@ pub fn run_parquet_baseline_timings( threshold: f32, iterations: usize, ) -> Result { - let mut decode = Duration::MAX; + let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; for _ in 0..iterations { let start = Instant::now(); let data = read_parquet_embedding_column(parquet_path)?; - decode = decode.min(start.elapsed()); + decompress = decompress.min(start.elapsed()); let start = Instant::now(); let scores = cosine_loop(&data.elements, data.num_rows, data.dim, query); @@ -193,7 +193,7 @@ pub fn run_parquet_baseline_timings( } Ok(VariantTimings { - decode, + decompress, cosine, filter, }) diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index 15132561b4d..1c587616d4e 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -16,15 +16,13 @@ use anyhow::Result; use vortex::array::ArrayRef; -use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; use 
vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::extension::ExtensionArrayExt; -use vortex::error::VortexExpect; use vortex::session::VortexSession; use vortex::utils::aliases::hash_set::HashSet; -use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; -use vortex_tensor::vector_search::build_constant_query_vector; + +use crate::verify::compute_cosine_scores; /// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. pub const DEFAULT_TOP_K: usize = 10; @@ -64,10 +62,10 @@ pub fn measure_recall_at_k( let row = (q * step).min(num_rows - 1); let query = extract_query_row(uncompressed, row, session)?; - let gt_scores = score_all_rows(uncompressed, &query, session)?; + let gt_scores = compute_cosine_scores(uncompressed, &query, session)?; let truth = top_k_indices(>_scores, top_k); - let lossy_scores = score_all_rows(compressed, &query, session)?; + let lossy_scores = compute_cosine_scores(compressed, &query, session)?; let lossy = top_k_indices(&lossy_scores, top_k); let truth_set: HashSet = truth.iter().copied().collect(); @@ -105,18 +103,6 @@ fn extract_query_row( Ok(slice[start..start + dim_usize].to_vec()) } -fn score_all_rows(data: &ArrayRef, query: &[f32], session: &VortexSession) -> Result> { - let num_rows = data.len(); - let query_vec = build_constant_query_vector(query, num_rows)?; - let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) - .vortex_expect("cosine similarity accepts matching Vector inputs") - .into_array(); - - let mut ctx = session.create_execution_ctx(); - let scores: PrimitiveArray = cosine.execute(&mut ctx)?; - Ok(scores.as_slice::().to_vec()) -} - /// Return the indices of the top-K highest scores, stable-sorted descending. 
fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { let mut idx: Vec = (0..scores.len()).collect(); @@ -164,22 +150,12 @@ mod tests { parquet_bytes: 0, }; - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let (tq_array, _) = rt - .block_on(prepare_variant( - &prepared, - Variant::VortexTurboQuant, - &SESSION, - )) - .unwrap(); + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); // With only 64 random rows, recall@10 won't be 1.0 but it should be well // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch // total regressions without being flaky on distribution noise. - let recall = measure_recall_at_k(&uncompressed, &tq_array, 4, 10, &SESSION).unwrap(); + let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10, &SESSION).unwrap(); assert!( recall >= 0.3, "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs new file mode 100644 index 00000000000..7dd6a564f36 --- /dev/null +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Correctness verification for vector-search variants. +//! +//! Before the timing loop runs, we compute cosine-similarity scores for a single query +//! row against the uncompressed baseline and against each prepared variant, then compare +//! the two score vectors element-by-element. This catches two distinct classes of bug: +//! +//! - A **lossless variant** that disagrees with the uncompressed scan (bug in the +//! compression pipeline, or in how we're routing through the scalar-fn dispatch, or in +//! the variant-specific decompress path). +//! - A **lossy variant** (TurboQuant) that drifts further from ground truth than we +//! 
expect from the bit-width and SORF rotation settings (regression in the encoder). +//! +//! The same `execute_cosine` function the timing loop uses is also what verification +//! uses, so the correctness check is validating the *exact* expression tree we're about +//! to benchmark. Lossless variants must match within [`LOSSLESS_TOLERANCE`]; lossy +//! variants must match within [`LOSSY_TOLERANCE`]. A hard-stop `Err` return on any +//! mismatch keeps the benchmark honest — you cannot publish throughput numbers for a +//! variant that's returning garbage. + +use anyhow::Result; +use anyhow::bail; +use vortex::array::ArrayRef; +use vortex::array::VortexSessionExecute; +use vortex::session::VortexSession; + +use crate::execute_cosine; + +/// Maximum acceptable absolute difference in cosine scores for a *lossless* variant +/// (uncompressed, BtrBlocks-default). `cosine_similarity` traverses the FSL storage and +/// reduces with f32 accumulators, so a pure algebraic change of encoding can shift a +/// score by a few ULPs of f32 precision. `1e-4` is well above that noise floor while +/// still catching real regressions. +pub const LOSSLESS_TOLERANCE: f32 = 1e-4; + +/// Maximum acceptable absolute difference in cosine scores for the *lossy* TurboQuant +/// variant. At the default 8-bit configuration the reconstructed dot product typically +/// drifts by well under 0.05 for unit-normalized vectors. `0.2` is a loose upper bound +/// that catches regressions without flaking on distribution-specific noise. +pub const LOSSY_TOLERANCE: f32 = 0.2; + +/// Row index used to pick the verification query. Row 0 is also what +/// [`crate::DEFAULT_QUERY_ROW`] selects, so the verification and timing paths exercise +/// identical inputs. +pub const VERIFICATION_QUERY_ROW: usize = 0; + +/// How lossy a variant is allowed to be when its scores are compared to the +/// uncompressed baseline. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VerificationKind { + /// Lossless variants must match within [`LOSSLESS_TOLERANCE`]. + Lossless, + /// Lossy variants must match within [`LOSSY_TOLERANCE`]. + Lossy, +} + +/// Per-variant correctness report. Captured for both pass and fail outcomes so the +/// caller can emit the numbers as dashboard measurements regardless. +#[derive(Debug, Clone, Copy)] +pub struct VerificationReport { + /// Number of rows compared (== dataset row count). + pub num_scores: usize, + /// Mean absolute difference between baseline and variant cosine scores. + pub mean_abs_diff: f64, + /// Max absolute difference between baseline and variant cosine scores. + pub max_abs_diff: f64, + /// Which tolerance band applied. + pub kind: VerificationKind, + /// Whether the variant's max-abs-diff stayed within its tolerance. + pub passed: bool, +} + +impl VerificationReport { + /// The tolerance that was applied to produce [`Self::passed`]. + pub fn tolerance(&self) -> f32 { + match self.kind { + VerificationKind::Lossless => LOSSLESS_TOLERANCE, + VerificationKind::Lossy => LOSSY_TOLERANCE, + } + } +} + +/// Compute cosine-similarity scores for a single query row on `data` and return them +/// as a plain `Vec`. This is just a convenience wrapper around +/// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting +/// `PrimitiveArray`. +pub fn compute_cosine_scores( + data: &ArrayRef, + query: &[f32], + session: &VortexSession, +) -> Result> { + let mut ctx = session.create_execution_ctx(); + let scores = execute_cosine(data, query, &mut ctx)?; + Ok(scores.as_slice::().to_vec()) +} + +/// Compare two equal-length score vectors and return their mean absolute difference +/// and max absolute difference, without evaluating a pass/fail threshold. 
+pub fn compare_scores(baseline: &[f32], other: &[f32]) -> (f64, f64) { + assert_eq!( + baseline.len(), + other.len(), + "compare_scores: length mismatch baseline={} other={}", + baseline.len(), + other.len(), + ); + + if baseline.is_empty() { + return (0.0, 0.0); + } + + let mut sum = 0.0f64; + let mut max: f64 = 0.0; + for (&b, &o) in baseline.iter().zip(other.iter()) { + // Treat (+0, -0) pairs as equal and propagate NaN as inf so it always fails + // the tolerance check below. + let diff = if b.is_nan() || o.is_nan() { + f64::INFINITY + } else { + (f64::from(b) - f64::from(o)).abs() + }; + sum += diff; + if diff > max { + max = diff; + } + } + (sum / baseline.len() as f64, max) +} + +/// Verify one variant's scores against a baseline and produce a full +/// [`VerificationReport`]. Whether `passed` is true depends on `kind`'s tolerance. +pub fn verify_scores( + baseline: &[f32], + variant_scores: &[f32], + kind: VerificationKind, +) -> VerificationReport { + let (mean_abs_diff, max_abs_diff) = compare_scores(baseline, variant_scores); + let tolerance = match kind { + VerificationKind::Lossless => f64::from(LOSSLESS_TOLERANCE), + VerificationKind::Lossy => f64::from(LOSSY_TOLERANCE), + }; + let passed = max_abs_diff <= tolerance; + VerificationReport { + num_scores: baseline.len(), + mean_abs_diff, + max_abs_diff, + kind, + passed, + } +} + +/// End-to-end variant verification: executes cosine on `variant_array` against the +/// same query used for the baseline and returns a [`VerificationReport`]. Returns +/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond +/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality +/// tradeoff. 
+pub fn verify_variant( + variant_name: &str, + variant_array: &ArrayRef, + query: &[f32], + baseline_scores: &[f32], + kind: VerificationKind, + session: &VortexSession, +) -> Result { + let scores = compute_cosine_scores(variant_array, query, session)?; + let report = verify_scores(baseline_scores, &scores, kind); + + if !report.passed { + let message = format!( + "{variant_name} correctness check failed: max_abs_diff={:.6}, \ + mean_abs_diff={:.6}, tolerance={:.6} ({:?})", + report.max_abs_diff, + report.mean_abs_diff, + report.tolerance(), + report.kind, + ); + match kind { + VerificationKind::Lossless => bail!("{message}"), + VerificationKind::Lossy => { + tracing::warn!("{message}"); + } + } + } + + Ok(report) +} + +#[cfg(test)] +mod tests { + use vortex_bench::SESSION; + + use super::*; + use crate::Variant; + use crate::prepare_variant; + use crate::test_utils::synthetic_vector; + + fn make_prepared(dim: u32, num_rows: usize, seed: u64) -> crate::PreparedDataset { + let uncompressed = synthetic_vector(dim, num_rows, seed); + crate::PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + // Filled in below from row 0. 
+ query: vec![], + parquet_bytes: 0, + } + } + + fn extract_row_zero(uncompressed: &ArrayRef, dim: u32) -> Vec { + use vortex::array::VortexSessionExecute; + use vortex::array::arrays::Extension; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + + let mut ctx = SESSION.create_execution_ctx(); + let ext = uncompressed.as_opt::().unwrap(); + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); + elements.as_slice::()[..dim as usize].to_vec() + } + + #[test] + fn compare_scores_handles_empty() { + let (mean, max) = compare_scores(&[], &[]); + assert_eq!(mean, 0.0); + assert_eq!(max, 0.0); + } + + #[test] + fn compare_scores_computes_mae_and_max() { + let base = [0.0f32, 1.0, 2.0, 3.0]; + let other = [0.0f32, 1.0, 2.5, 3.0]; + let (mean, max) = compare_scores(&base, &other); + assert!((max - 0.5).abs() < 1e-9); + assert!((mean - 0.125).abs() < 1e-9); + } + + #[test] + fn verify_scores_passes_for_identical_inputs() { + let base = [0.5f32; 10]; + let report = verify_scores(&base, &base, VerificationKind::Lossless); + assert!(report.passed); + assert_eq!(report.max_abs_diff, 0.0); + assert_eq!(report.mean_abs_diff, 0.0); + assert_eq!(report.num_scores, 10); + } + + #[test] + fn verify_scores_fails_for_lossless_beyond_tolerance() { + let base = [0.5f32; 10]; + let mut other = [0.5f32; 10]; + other[3] = 0.50001; // diff ≈ 1e-5, comfortably below the 1e-4 lossless bound + let report_ok = verify_scores(&base, &other, VerificationKind::Lossless); + assert!( + report_ok.passed, + "1e-5 drift should pass, got max={:.2e}", + report_ok.max_abs_diff + ); + + other[3] = 0.51; // diff of 0.01, well above 1e-4 + let report_bad = verify_scores(&base, &other, VerificationKind::Lossless); + assert!( + 
!report_bad.passed, + "1e-2 drift should fail, got max={:.2e}", + report_bad.max_abs_diff + ); + } + + #[test] + fn verify_scores_lossy_tolerates_small_drift() { + let base = [0.9f32; 10]; + let mut other = [0.9f32; 10]; + other[0] = 1.0; // diff of 0.1 + let report = verify_scores(&base, &other, VerificationKind::Lossy); + assert!( + report.passed, + "0.1 drift should pass lossy tolerance, got max={}", + report.max_abs_diff + ); + } + + #[test] + fn verify_scores_fails_on_nan() { + let base = [0.5f32, 0.5]; + let other = [0.5f32, f32::NAN]; + let report = verify_scores(&base, &other, VerificationKind::Lossless); + assert!(!report.passed); + assert!(report.max_abs_diff.is_infinite()); + } + + #[test] + fn vortex_default_matches_uncompressed_end_to_end() { + let dim = 128u32; + let num_rows = 64usize; + let mut prepared = make_prepared(dim, num_rows, 0xC0FFEE); + prepared.query = extract_row_zero(&prepared.uncompressed, dim); + + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + + let default_prep = prepare_variant(&prepared, Variant::VortexDefault, &SESSION).unwrap(); + let report = verify_variant( + "vortex-default", + &default_prep.array, + &prepared.query, + &baseline_scores, + VerificationKind::Lossless, + &SESSION, + ) + .expect("vortex-default must be lossless against the uncompressed baseline"); + assert!(report.passed); + } + + #[test] + fn vortex_turboquant_stays_within_lossy_tolerance() { + let dim = 128u32; + let num_rows = 64usize; + let mut prepared = make_prepared(dim, num_rows, 0xDEADBEEF); + prepared.query = extract_row_zero(&prepared.uncompressed, dim); + + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let report = verify_variant( + "vortex-turboquant", + &tq_prep.array, + &prepared.query, + &baseline_scores, + VerificationKind::Lossy, + 
&SESSION, + ) + .expect("TurboQuant verification should not error"); + assert!( + report.passed, + "TurboQuant drift {:.4} exceeds lossy tolerance {:.4}", + report.max_abs_diff, + report.tolerance() + ); + } +} From 236e098b091aea37567b75385356478d0ff939fb Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 14:40:52 +0000 Subject: [PATCH 10/18] =?UTF-8?q?vector-search-bench:=20rename=20"parquet"?= =?UTF-8?q?=20baseline=20=E2=86=92=20"handrolled"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compute side of the baseline isn't really a parquet implementation — it's a **hand-rolled Rust scalar cosine loop over a flat `Vec`** that happens to have been decoded from a parquet file via `parquet-rs` / `arrow-rs`. Labeling it as "parquet" in CLI flags, metric names, and docs suggested the baseline was measuring a real parquet-based analytical engine, when in reality it's the minimum cost a Rust programmer could get away with given no query engine. User-facing rename: everywhere the baseline was called "parquet" it now says "handrolled" — the CLI format, the metric labels, the CI matrix entry, the README, the type and function names in the baseline module, and the module filename itself (`parquet_baseline.rs` → `handrolled_baseline.rs`). Unchanged: - `read_parquet_embedding_column` keeps its name because it really does read parquet. Only the baseline-level wrappers take the `handrolled` label. - `target.format` stays `Format::Parquet` — the storage side really is parquet on disk, and dashboards grouping by format should still put the baseline's size/decompress measurements under "parquet". The `CompressionTimingMeasurement::to_json` prefix therefore still reads `"parquet_rs-zstd "`, which honestly describes the parquet reader stack. Only the compute label in the metric `name` string reflects the hand-rolled compute path. 
The module-level doc comment on `handrolled_baseline.rs` now explicitly frames the baseline as a compute-cost *floor*, not a fair DBMS comparison, and points at DuckDB `list_cosine_similarity` and DataFusion vector UDFs as better "what real users pay for parquet+cosine" baselines for future work. Verified end-to-end against the 5000×768 synthetic cohere-small fixture. All 13 existing tests still pass (the baseline test rename from `parquet_baseline_reads_fsl_column` to `handrolled_baseline_reads_fsl_column` is the only functional change to tests). Correctness verification still reports: handrolled max_abs_diff = 0.000e0 (bit-identical, lossless) vortex-uncompressed max_abs_diff = 0.000e0 vortex-default max_abs_diff = 0.000e0 vortex-turboquant max_abs_diff = 5.18e-3 (within 0.2 lossy tolerance) Signed-off-by: Claude Signed-off-by: Connor Tsui --- .github/workflows/bench.yml | 2 +- benchmarks/vector-search-bench/README.md | 109 +++++++++++++----- ...uet_baseline.rs => handrolled_baseline.rs} | 86 +++++++++----- benchmarks/vector-search-bench/src/lib.rs | 2 +- benchmarks/vector-search-bench/src/main.rs | 86 ++++++++------ 5 files changed, 188 insertions(+), 97 deletions(-) rename benchmarks/vector-search-bench/src/{parquet_baseline.rs => handrolled_baseline.rs} (69%) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index f700aac0a78..f900022e2f6 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -51,7 +51,7 @@ jobs: - id: vector-search-bench name: Vector Similarity Search build_args: "" - formats: "parquet,vortex-uncompressed,vortex-default,vortex-turboquant" + formats: "handrolled,vortex-uncompressed,vortex-default,vortex-turboquant" steps: - uses: runs-on/action@v2 if: github.repository == 'vortex-data/vortex' diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md index 7a10508910f..49f117e5c26 100644 --- a/benchmarks/vector-search-bench/README.md +++ 
b/benchmarks/vector-search-bench/README.md @@ -5,55 +5,88 @@ embedding corpora. ## What it measures -For each `(dataset, format)` pair, the benchmark records four numbers: - -1. **Size** — compressed storage footprint in bytes. For the Vortex variants - that round-trip through `.vortex` files today (uncompressed & BtrBlocks - default) this is the real on-disk size. For `vortex-turboquant` it is - the in-memory `.nbytes()` footprint, because the `L2Denorm` scalar-fn - array does not yet have a concrete `serialize_metadata` implementation. -2. **Full-scan decode time** — wall time to materialize the whole `Vector` - column into a `FixedSizeListArray`. -3. **Cosine-similarity execute time** — wall time for - `CosineSimilarity(data, const_query)` executed to a materialized f32 array. -4. **Cosine-filter execute time** — wall time for the full - `Binary(Gt, [CosineSimilarity, threshold])` expression tree executed to - a `BoolArray`. - -The TurboQuant variant additionally reports **Recall@10** against the -uncompressed Vortex scan as local ground truth. Lossless variants are trivially -1.0 so they are not re-measured. +For each `(dataset, format)` pair, the benchmark records: + +1. **`nbytes`** — in-memory footprint of the variant's array tree, in bytes. + Reporting the in-memory `.nbytes()` instead of an on-disk file size is + deliberate: the Vortex default write path runs BtrBlocks on every tree + regardless of whether it's already compressed, so "on-disk size" would + collapse `vortex-uncompressed` and `vortex-default` to the same bytes + even though their in-memory trees are different. The `nbytes()` + number is consistent with what the *compute* measurements actually + operate on. + - The `handrolled` baseline reports the canonical parquet file size + on disk — that's the only encoded representation it has. +2. **Compress time** — wall time to build the variant tree from the + materialized uncompressed source. 
~0 for `vortex-uncompressed` (identity), + meaningful for the two compressed variants. +3. **Decompress time** — wall time to execute the variant tree all the way + back into a canonical `FixedSizeListArray` with a materialized f32 + element buffer. For `vortex-uncompressed` this is a no-op; for + `vortex-default` it includes ALP-RD bit-unpacking; for + `vortex-turboquant` it includes the inverse SORF rotation and + dictionary lookup. +4. **Cosine-similarity time** — `CosineSimilarity(data, const_query)` + executed to a materialized f32 array. +5. **Cosine-filter time** — `Binary(Gt, [CosineSimilarity, threshold])` + executed to a `BoolArray`. +6. **Recall@10** (TurboQuant only) — the fraction of the exact top-10 + nearest neighbours that TurboQuant recovers, using the uncompressed + Vortex scan as local ground truth. + +Before any timing starts, the benchmark runs a **correctness verification +pass**: cosine scores for a single query are computed against every +variant and compared to the uncompressed baseline. Lossless variants must +match within `1e-4` max-abs-diff; TurboQuant must stay within `0.2`. A +mismatch bails the run — you cannot publish throughput numbers for a +variant that returns wrong answers. ## Formats -- `parquet` — Parquet file read via `parquet::arrow` into an Arrow - `FixedSizeListArray`, then a hand-rolled Rust cosine loop. This is the - "what you'd do without Vortex" external floor. +- `handrolled` — Hand-rolled Rust scalar cosine loop over a flat + `Vec` that was decoded from the canonical parquet file via + `parquet-rs` / `arrow-rs`. The **decompress** phase does the parquet + read, downcasts to `Float32Array`, and memcpies into a plain `Vec`. + The **compute** phase is a plain scalar loop over `&[f32]` — no Arrow + compute kernels, no scalar-function dispatch, no SIMD annotations. + + This is a **compute-cost floor**, not a realistic parquet-on-DBMS + baseline. 
It answers the question "what's the minimum cost you could + get away with if you wrote a vector-search scan by hand with no query + engine?" Real parquet users would pay substantially more (DuckDB + `list_cosine_similarity`, DataFusion with a vector UDF, etc.) — + adding those as additional baselines is a natural v2 direction. - `vortex-uncompressed` — Raw `Vector` extension array, no encoding-level compression applied. - `vortex-default` — `BtrBlocksCompressor::default()` applied to the FSL - storage child. Generic lossless Vortex compression for float vectors. + storage child. On float vectors this typically finds ~15% lossless + savings via ALP-RD (mantissa/exponent split + bitpacking). - `vortex-turboquant` — The full `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` pipeline. - Lossy; recall@10 is reported alongside throughput. + Lossy; recall@10 is reported alongside throughput. At the default 8-bit + config this typically gives ~3× storage reduction at >90% top-10 + recall. ## Datasets -The first dataset wired up is **Cohere-100K** (`cohere-small`): 100K rows × -768 dims, cosine metric, ~150 MB zstd-parquet. This is the smallest -VectorDBBench-supplied embedding corpus and sits comfortably inside a CI -time / bandwidth budget. +The smallest built-in dataset is **Cohere-100K** (`cohere-small`): 100K +rows × 768 dims, cosine metric, ~150 MB zstd-parquet. It's the smallest +VectorDBBench-supplied corpus that still exercises every encoding path. +Larger variants (`cohere-medium`, `openai-small`, `openai-medium`, +`bioasq-medium`, `glove-medium`) are wired up for local / on-demand +experiments; see `vortex-bench/src/vector_dataset.rs` for the full list. -The upstream URL is -`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. The -public Zilliz bucket is anonymous-readable so the code _can_ hit it directly. +The upstream URL for Cohere-100K is +`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. 
+The public Zilliz bucket is anonymous-readable so the code can hit it +directly. ## Running locally ```bash cargo run -p vector-search-bench --release -- \ --datasets cohere-small \ - --formats parquet,vortex-uncompressed,vortex-default,vortex-turboquant \ + --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \ --iterations 5 \ -d table ``` @@ -62,6 +95,20 @@ The first run downloads the parquet file into `vortex-bench/data/cohere-small/cohere-small.parquet` and caches it idempotently for subsequent runs. +### Running without network access + +The `gen_synthetic_dataset` helper writes a VectorDBBench-shape parquet +file (`id: int64` + `emb: list`, zstd-compressed) at any path. +Use it to populate the dataset cache so the benchmark's idempotent +download step skips the HTTP fetch: + +```bash +cargo run -p vector-search-bench --bin gen_synthetic_dataset --release -- \ + --num-rows 5000 \ + --dim 768 \ + --out vortex-bench/data/cohere-small/cohere-small.parquet +``` + ## CI note: dataset mirror CI runs after every develop-branch merge. Hitting `assets.zilliz.com` diff --git a/benchmarks/vector-search-bench/src/parquet_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs similarity index 69% rename from benchmarks/vector-search-bench/src/parquet_baseline.rs rename to benchmarks/vector-search-bench/src/handrolled_baseline.rs index 222a125f362..85267c358e2 100644 --- a/benchmarks/vector-search-bench/src/parquet_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -1,27 +1,42 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Parquet-Arrow hand-rolled cosine similarity baseline. +//! Hand-rolled Rust cosine similarity baseline. //! -//! This module provides the "what you'd do without Vortex" floor for the vector-search -//! benchmark. It reads the canonical parquet file for a dataset via `parquet::arrow`, -//! 
decodes the `emb` column to an Arrow `FixedSizeListArray`, and then runs a -//! straightforward Rust cosine-similarity loop — no scalar functions, no lazy expressions, -//! no index. +//! This module provides the *compute-cost floor* the other Vortex variants are measured +//! against. It is **not** a realistic "parquet in a DBMS" baseline — it's the minimum +//! amount of work a Rust programmer could get away with if they wrote a vector-search +//! scan by hand with no query engine, no scalar-function dispatch, and no Arrow compute +//! kernels. //! -//! The four measurements produced mirror those of the Vortex variants so dashboards can -//! put the parquet bar right next to the vortex bars: +//! Two distinct phases run per iteration, and the benchmark times them separately so the +//! dashboard can separate storage-read cost from compute cost: //! -//! 1. Compressed size — the on-disk parquet file in bytes. -//! 2. Full-scan decode time — parquet → arrow record batches → concatenated -//! `FixedSizeListArray`. -//! 3. Cosine-similarity execute time — hand-rolled loop producing a `Vec` of scores. -//! 4. Filter execute time — the same loop materializing into a `Vec` where -//! `score > threshold`. +//! 1. **Decompress** ([`read_parquet_embedding_column`]) — reads the canonical parquet +//! file via `parquet-rs`, downcasts the `emb` column to an Arrow `Float32Array`, and +//! copies every value into a flat `Vec`. This phase is the only place Arrow is +//! actually used — only for the decode. The `memcpy` at the end is incidental: we +//! could operate directly on `Float32Array::values()` with identical performance, +//! but taking ownership of a `Vec` frees the Arrow `RecordBatch` lifetimes. +//! 2. **Compute** ([`cosine_loop`] and [`filter_loop`]) — runs a plain scalar Rust loop +//! over `&[f32]`. Arrow is no longer involved. There's no SIMD, no unrolling +//! annotations, no dispatch overhead, no output-array allocation beyond a single +//! `Vec`. 
This is deliberately "the fastest you could possibly make it go +//! without writing SIMD intrinsics". //! -//! This module does *not* include the parquet decode time in the cosine/filter wall -//! times. Decoding is treated as its own measurement. This matches how the Vortex variants -//! separate decode from compute. +//! Calling this "the parquet baseline" would be misleading, because: +//! +//! - The compute layer has nothing to do with parquet — parquet is only the input +//! encoding, not the execution substrate. +//! - Real parquet-on-DBMS engines (DuckDB's `list_cosine_similarity`, DataFusion with a +//! vector UDF, etc.) would pay substantial dispatch / planner / row-iterator cost +//! that this loop skips entirely. +//! +//! Think of it as: "If you didn't have Vortex and didn't feel like reaching for a query +//! engine, what's the minimum scan cost you could get away with on this data?" That's +//! the question this module answers, and it's intentionally a lower bound rather than a +//! fair DBMS comparison. Future work could add DuckDB / DataFusion baselines alongside +//! this one for the DBMS-level comparison. use std::fs::File; use std::path::Path; @@ -43,8 +58,15 @@ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use crate::VariantTimings; /// Read the entire `emb` column of a parquet file into a single flat `Vec`, along -/// with the dimension and row count. -pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { +/// with the dimension and row count. This is the *decompress* phase of the hand-rolled +/// baseline — it's the only place Arrow is actually used. `parquet-rs` does the file +/// decode, we downcast to `Float32Array`, and then memcpy into a plain `Vec` so +/// the compute loop can operate over a raw slice without holding any Arrow +/// `RecordBatch` references. 
+/// +/// Kept under its `parquet` name because this function *actually reads parquet*; only +/// the compute-side wrappers take the `handrolled` label. +pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { let file = File::open(parquet_path) .with_context(|| format!("open parquet file {}", parquet_path.display()))?; let file_size = file.metadata()?.len(); @@ -88,7 +110,7 @@ pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result, new_dim: usize) -> Result<()> } } -/// The flattened representation of a parquet file's embedding column, suitable for a -/// hand-rolled distance loop. -pub struct ParquetBaselineData { +/// The flattened representation of an embedding column, suitable for a hand-rolled +/// distance loop. Intentionally decoupled from any format — the compute side doesn't +/// care how the data got into this `Vec`. +pub struct HandrolledBaselineData { /// All rows concatenated: `elements.len() == num_rows * dim`. pub elements: Vec, /// Vector dimensionality. pub dim: usize, /// Number of rows. pub num_rows: usize, - /// On-disk size of the parquet file in bytes. + /// On-disk size of the parquet file in bytes. Reported as the "handrolled size" + /// measurement because parquet is the only format this benchmark serializes to + /// on disk today — a future v2 could report additional storage formats (raw f32 + /// blob, Arrow IPC, etc.) alongside it. pub file_size: u64, } -/// Run the decode / cosine / filter baseline microbenchmarks and return the best-of-N -/// wall times. Decoding is re-parquet-reading from disk on each iteration (matches how -/// the Vortex variants also re-execute from scratch each iteration). -pub fn run_parquet_baseline_timings( +/// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline +/// and return the best-of-N wall times. 
The decompress phase re-reads the parquet file +/// from disk on each iteration (matches how the Vortex variants re-execute their tree +/// from scratch each iteration), and the compute phase runs [`cosine_loop`] and +/// [`filter_loop`] over the flat `Vec` the decompress phase produced. +pub fn run_handrolled_baseline_timings( parquet_path: &Path, query: &[f32], threshold: f32, @@ -283,7 +311,7 @@ mod tests { } #[test] - fn parquet_baseline_reads_fsl_column() { + fn handrolled_baseline_reads_fsl_column() { let file = write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) .unwrap(); diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index fe27400dce6..9f61f62484f 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -32,7 +32,7 @@ use std::time::Duration; use std::time::Instant; -pub mod parquet_baseline; +pub mod handrolled_baseline; pub mod recall; pub mod verify; diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 1e8b0643bc7..4d675030f4f 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -9,10 +9,15 @@ //! ```bash //! cargo run -p vector-search-bench --release -- \ //! --datasets cohere-small \ -//! --variants vortex-uncompressed,vortex-default,vortex-turboquant \ +//! --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \ //! --iterations 5 \ //! -d table //! ``` +//! +//! The `handrolled` variant is a hand-rolled Rust scalar cosine loop over a flat +//! `Vec` decoded from the dataset's canonical parquet file; it is a compute-cost +//! floor, not a realistic parquet-on-DBMS baseline. See +//! [`handrolled_baseline`](vector_search_bench::handrolled_baseline) for details. 
use std::borrow::Cow; use std::path::PathBuf; @@ -23,7 +28,7 @@ use clap::Parser; use indicatif::ProgressBar; use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; -use vector_search_bench::parquet_baseline::run_parquet_baseline_timings; +use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; use vector_search_bench::recall::DEFAULT_TOP_K; @@ -61,9 +66,9 @@ struct Args { /// Which benchmark variants to run, using kebab-cased labels. The `--formats` name is /// used (instead of `--variants`) so this benchmark matches the CI invocation /// convention shared across random-access-bench / compress-bench. Accepted values: - /// `parquet`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. Defaults - /// to running all four. - #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Parquet, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] + /// `handrolled`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. + /// Defaults to running all four. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Handrolled, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] formats: Vec, /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables @@ -123,9 +128,12 @@ impl SelectableDataset { #[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] enum SelectableFormat { - /// Parquet-Arrow hand-rolled cosine loop baseline. - #[clap(name = "parquet")] - Parquet, + /// Hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from the + /// canonical parquet file via `parquet-rs` / `arrow-rs`. Compute-cost floor — + /// not a realistic parquet-on-DBMS baseline. See + /// [`vector_search_bench::handrolled_baseline`]. 
+ #[clap(name = "handrolled")] + Handrolled, /// Raw `Vector` with no encoding compression. #[clap(name = "vortex-uncompressed")] VortexUncompressed, @@ -140,7 +148,7 @@ enum SelectableFormat { impl SelectableFormat { fn into_variant(self) -> Option { match self { - SelectableFormat::Parquet => None, + SelectableFormat::Handrolled => None, SelectableFormat::VortexUncompressed => Some(Variant::VortexUncompressed), SelectableFormat::VortexDefault => Some(Variant::VortexDefault), SelectableFormat::VortexTurboQuant => Some(Variant::VortexTurboQuant), @@ -160,7 +168,7 @@ async fn main() -> Result<()> { .map(SelectableDataset::into_dataset) .collect(); - let run_parquet_baseline = args.formats.contains(&SelectableFormat::Parquet); + let run_handrolled_baseline = args.formats.contains(&SelectableFormat::Handrolled); let variants: Vec = args .formats .iter() @@ -185,8 +193,8 @@ async fn main() -> Result<()> { ); // Ground-truth cosine scores for the verification query — the scores produced by - // the uncompressed Vortex scan. Every other variant (including the parquet - // hand-rolled loop) will be compared against this. + // the uncompressed Vortex scan. Every other variant (including the hand-rolled + // baseline) will be compared against this. let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION) .context("compute ground-truth cosine scores for verification")?; @@ -196,58 +204,66 @@ async fn main() -> Result<()> { prepared.name ); - // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label - // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex - // variants. The parquet baseline uses a hand-rolled Rust cosine loop; it must - // match the Vortex cosine scores within lossless tolerance (f32 ULPs) because - // it's computing the same math on the same underlying f32 values. - if run_parquet_baseline { + // Hand-rolled baseline. 
Emitted as a separate pseudo-variant with label + // `handrolled` so it shows up in dashboards next to the Vortex variants. This + // is a hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from + // parquet via `parquet-rs`; it must match the Vortex cosine scores within the + // lossless tolerance (f32 ULPs) because it's computing the same math on the + // same underlying f32 values. + // + // `target.format` stays `Format::Parquet` because the *storage* side is still + // parquet on disk — only the *compute* is hand-rolled. The metric `name` field + // carries the `handrolled` label so human readers can tell the compute apart + // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. + if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; let baseline_data = - vector_search_bench::parquet_baseline::read_parquet_embedding_column(&parquet_path) - .context("read parquet emb column for verification")?; - let parquet_scores = vector_search_bench::parquet_baseline::cosine_loop( + vector_search_bench::handrolled_baseline::read_parquet_embedding_column( + &parquet_path, + ) + .context("read parquet emb column for verification")?; + let handrolled_scores = vector_search_bench::handrolled_baseline::cosine_loop( &baseline_data.elements, baseline_data.num_rows, baseline_data.dim, &prepared.query, ); - let parquet_report = vector_search_bench::verify::verify_scores( + let handrolled_report = vector_search_bench::verify::verify_scores( &baseline_scores, - &parquet_scores, + &handrolled_scores, VerificationKind::Lossless, ); - if !parquet_report.passed { + if !handrolled_report.passed { anyhow::bail!( - "parquet baseline correctness check failed on {}: \ + "handrolled baseline correctness check failed on {}: \ max_abs_diff={:.6}, mean_abs_diff={:.6}, tolerance={:.6}", prepared.name, - parquet_report.max_abs_diff, - parquet_report.mean_abs_diff, - parquet_report.tolerance(), + handrolled_report.max_abs_diff, + 
handrolled_report.mean_abs_diff, + handrolled_report.tolerance(), ); } tracing::info!( - "parquet/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + "handrolled/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", prepared.name, - parquet_report.max_abs_diff, - parquet_report.mean_abs_diff, + handrolled_report.max_abs_diff, + handrolled_report.mean_abs_diff, ); verification.push(CustomUnitMeasurement { - name: format!("correctness-max-diff/parquet/{}", prepared.name), + name: format!("correctness-max-diff/handrolled/{}", prepared.name), format: Format::Parquet, unit: Cow::from("abs-diff"), - value: parquet_report.max_abs_diff, + value: handrolled_report.max_abs_diff, }); - let baseline_timings = run_parquet_baseline_timings( + let baseline_timings = run_handrolled_baseline_timings( &parquet_path, &prepared.query, DEFAULT_THRESHOLD, args.iterations, )?; - let label = "parquet"; + let label = "handrolled"; let bench_name = format!("{label}/{}", prepared.name); sizes.push(CustomUnitMeasurement { From d012eb1210b78090f5d6c9dde08ab56719bc72d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:02:16 +0000 Subject: [PATCH 11/18] vector-search-bench: fix correctness bugs found in review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review caught four real bugs plus a stylistic stray import. All are small, targeted fixes. 1. `build_similarity_search_tree` (vortex-tensor) and `execute_cosine` (vector-search-bench) wrapped `CosineSimilarity::try_new_array` in `vortex_expect`, which panics on the `Err` branch. Both functions' doc comments claimed they *returned* errors on dimension mismatch, so the behavior contradicted the contract: a caller passing a wrong-dimension query vector would crash instead of seeing a clean error. Fixed by replacing `.vortex_expect(...)` with `?`. 2. 
The progress bar in `main.rs` was initialized with `datasets.len() * args.formats.len()` (which counts the handrolled baseline) but only `inc(1)`'d inside the Vortex-variants loop, so the bar finished at `N-1/N` whenever handrolled was selected. Fixed by incrementing inside the handrolled block and recomputing `total_work` from the actual increment sites. 3. `extract_query_row` blindly called `PrimitiveArray::as_slice::<f32>()` without checking the underlying ptype. The benchmark only ever uses f32 today, but the `Vector` extension type supports f16/f32/f64, so a future caller that accidentally passes an f64 `Vector` would get a mis-typed slice. Fixed by asserting `elements.ptype() == PType::F32` before the cast with a clear error message. While there, simplified the convoluted bounds check (was `row * dim + dim > len * dim`, reduces to `row >= len` without the multiplication). 4. `gen_synthetic_dataset.rs` and the README had shell examples using `--bin gen-synthetic-dataset` with hyphens, but cargo's default bin name is the filename minus extension — i.e., underscores. The example would fail to resolve. Fixed to `gen_synthetic_dataset`. 5. `use std::io::Write;` was stranded at the bottom of `main.rs` as a single-line import after the `main` function. Moved to the top import block for idiomatic style. All 13 vector-search-bench tests still pass. 222 vortex-tensor tests still pass. clippy and rustfmt clean. Public API unchanged — no public-api.lock regen needed. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/README.md | 5 ++- .../src/bin/gen_synthetic_dataset.rs | 5 ++- benchmarks/vector-search-bench/src/lib.rs | 44 ++++++++++++------- benchmarks/vector-search-bench/src/main.rs | 10 +++-- vortex-tensor/src/vector_search.rs | 5 +-- 5 files changed, 43 insertions(+), 26 deletions(-) diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md index 49f117e5c26..50af601ec2a 100644 --- a/benchmarks/vector-search-bench/README.md +++ b/benchmarks/vector-search-bench/README.md @@ -103,12 +103,15 @@ Use it to populate the dataset cache so the benchmark's idempotent download step skips the HTTP fetch: ```bash -cargo run -p vector-search-bench --bin gen-synthetic-dataset --release -- \ +cargo run -p vector-search-bench --release --bin gen_synthetic_dataset -- \ --num-rows 5000 \ --dim 768 \ --out vortex-bench/data/cohere-small/cohere-small.parquet ``` +(Cargo's default bin name is the filename minus extension, so underscores, +not hyphens.) + ## CI note: dataset mirror CI runs after every develop-branch merge. Hitting `assets.zilliz.com` diff --git a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs index 3f5500edfcf..16e96cce5b4 100644 --- a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs +++ b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs @@ -12,14 +12,15 @@ //! Example: //! //! ```bash -//! cargo run -p vector-search-bench --bin gen-synthetic-dataset --release -- \ +//! cargo run -p vector-search-bench --bin gen_synthetic_dataset --release -- \ //! --num-rows 5000 \ //! --dim 768 \ //! --out vortex-bench/data/cohere-small/cohere-small.parquet //! ``` //! //! After running this, `vector-search-bench --datasets cohere-small` will find the -//! cached parquet file and skip the HTTP download via `idempotent_async`. +//! 
cached parquet file and skip the HTTP download via `idempotent_async`. (Cargo's +//! default bin name is the filename minus extension — underscores, not hyphens.) use std::fs::File; use std::path::PathBuf; diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 9f61f62484f..bfa0aa55c8d 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -46,7 +46,6 @@ use vortex::array::VortexSessionExecute; use vortex::array::arrays::BoolArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::PrimitiveArray; -use vortex::error::VortexExpect; use vortex::session::VortexSession; use vortex_bench::Format; use vortex_bench::SESSION; @@ -237,35 +236,44 @@ fn extract_emb_column(struct_array: &ArrayRef) -> Result { } /// Pull a single row out of a `Vector` extension array as a plain `Vec`. -fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { +/// +/// Only `f32`-typed `Vector` arrays are supported today — the benchmark deliberately +/// restricts itself to `f32` vectors, so we assert the element type rather than +/// quietly returning a mis-cast slice. +pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { use vortex::array::arrays::Extension; use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + use vortex::dtype::PType; - let mut ctx = SESSION.create_execution_ctx(); + if row >= vector_ext.len() { + bail!( + "query row {row} out of bounds for dataset of length {}", + vector_ext.len() + ); + } let ext_view = vector_ext .as_opt::() .context("prepared dataset must be a Vector extension array")?; + let mut ctx = SESSION.create_execution_ctx(); + // Execute storage array to its canonical FSL form. 
let fsl: FixedSizeListArray = ext_view.storage_array().clone().execute(&mut ctx)?; - let dim_usize = { - let vortex::dtype::DType::FixedSizeList(_, d, _) = fsl.dtype() else { - bail!("storage dtype must be FixedSizeList"); - }; - *d as usize + let dim_usize = match fsl.dtype() { + vortex::dtype::DType::FixedSizeList(_, d, _) => *d as usize, + other => bail!("storage dtype must be FixedSizeList, got {other}"), }; - if row * dim_usize + dim_usize > vector_ext.len() * dim_usize { + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; + if elements.ptype() != PType::F32 { bail!( - "query row {row} out of bounds for dataset of length {}", - vector_ext.len() + "extract_query_row currently only supports f32 Vector columns, got {:?}", + elements.ptype() ); } - - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; let slice = elements.as_slice::(); let start = row * dim_usize; Ok(slice[start..start + dim_usize].to_vec()) @@ -440,6 +448,12 @@ pub fn decompress_full_scan( /// Execute `CosineSimilarity(data, broadcast(query))` to a materialized `f32` /// [`PrimitiveArray`]. Shared between the timing loop and the correctness-verification /// path so both exercise the exact same expression tree. +/// +/// # Errors +/// +/// Returns an error if `data` is not a [`vortex_tensor::vector::Vector`] extension array, +/// if `query`'s length doesn't match the database vector dimension, or if the execution +/// context rejects the expression. pub fn execute_cosine( data: &ArrayRef, query: &[f32], @@ -450,9 +464,7 @@ pub fn execute_cosine( let num_rows = data.len(); let query_vec = build_constant_query_vector(query, num_rows)?; - let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) - .vortex_expect("cosine similarity accepts matching Vector inputs") - .into_array(); + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows)?.into_array(); Ok(cosine.execute(ctx)?) 
} diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 4d675030f4f..e6ad3ad8357 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -20,6 +20,7 @@ //! [`handrolled_baseline`](vector_search_bench::handrolled_baseline) for details. use std::borrow::Cow; +use std::io::Write; use std::path::PathBuf; use anyhow::Context; @@ -175,7 +176,10 @@ async fn main() -> Result<()> { .filter_map(|f| f.into_variant()) .collect(); - let total_work = datasets.len() * args.formats.len(); + // One `progress.inc(1)` happens per Vortex variant, plus one per dataset for the + // handrolled path when it's enabled, so recompute `total_work` from those actual + // increment sites rather than from `args.formats.len()`, which over-counted. + let total_work = datasets.len() * (variants.len() + usize::from(run_handrolled_baseline)); let progress = ProgressBar::new(total_work as u64); let mut timings: Vec = Vec::new(); @@ -287,6 +291,8 @@ async fn main() -> Result<()> { format: Format::Parquet, time: baseline_timings.filter, }); + + progress.inc(1); } for &variant in &variants { @@ -422,5 +428,3 @@ async fn main() -> Result<()> { Ok(()) } - -use std::io::Write; diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs index 37adbf27a61..6934fa52a71 100644 --- a/vortex-tensor/src/vector_search.rs +++ b/vortex-tensor/src/vector_search.rs @@ -57,7 +57,6 @@ use vortex_array::dtype::extension::ExtDType; use vortex_array::extension::EmptyMetadata; use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::fns::operators::Operator; -use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -159,9 +158,7 @@ pub fn build_similarity_search_tree( let num_rows = data.len(); let query_vec = build_constant_query_vector(query, num_rows)?; - let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows) - 
.vortex_expect("cosine similarity accepts two matching Vector extension arrays") - .into_array(); + let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); From 00e88ea58324f88c9e82d428abc8252e6d6ca630 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:04:59 +0000 Subject: [PATCH 12/18] vector-search-bench: remove dead code from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup of seven dead-or-suppressed items the review flagged: 1. `QueryLen` type alias (`lib.rs`) — defined but only "used" via `let _ = QueryLen::default;`, a warning-suppression hack. The doc comment claimed it made "broadcast semantics obvious at call sites" but no call site actually referenced it. Both the alias and the suppression `let` are gone. 2. `async-trait` and `itertools` Cargo dependencies — listed in `vector-search-bench/Cargo.toml` but not referenced anywhere in the crate's source tree (probably copied from `compress-bench` when the crate was bootstrapped). Removed. 3. `VERIFICATION_QUERY_ROW` constant (`verify.rs`) — defined as `0` and documented as "the row we verify against" but never referenced from anywhere; `main.rs` passes `prepared.query` directly, and `prepared.query` is extracted at `DEFAULT_QUERY_ROW` time inside `prepare_dataset`. Removed. 4. Dead-code-suppression hack in `vortex_bench::conversions::tests`: // ...suppress the dead-code warning by referencing them in a // no-op expression. let _ = (Nullability::NonNullable, PType::I32); Both imports were never used in the test body. Removed the imports and the dead let. 5. `let _ = emb_idx;` in `handrolled_baseline::read_parquet_embedding_column`. Replaced with destructuring via `let (_, emb_field) = ...`. 6. 
`HandrolledBaselineData::file_size` — populated by `read_parquet_ embedding_column` via a second `File::metadata()` call, but no caller reads it: the benchmark's "handrolled size" measurement reads `PreparedDataset::parquet_bytes` (populated once in `prepare_dataset`). Keeping the duplicated state was a recipe for drift. Removed the field and its populating code, and added a doc comment on the struct pointing at `parquet_bytes` as the canonical file-size source. All 13 vector-search-bench tests + 16 vortex-bench tests still pass. clippy / rustfmt clean. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 2 -- benchmarks/vector-search-bench/Cargo.toml | 2 -- .../src/handrolled_baseline.rs | 16 +++++++--------- benchmarks/vector-search-bench/src/lib.rs | 7 ------- benchmarks/vector-search-bench/src/verify.rs | 5 ----- vortex-bench/src/conversions.rs | 6 ------ 6 files changed, 7 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ebc8530825a..aa4519e13ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10060,10 +10060,8 @@ dependencies = [ "arrow-array 58.0.0", "arrow-buffer 58.0.0", "arrow-schema 58.0.0", - "async-trait", "clap", "indicatif", - "itertools 0.14.0", "parquet 58.0.0", "tempfile", "tokio", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index 8e8101ddcee..bbc620572b5 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -19,10 +19,8 @@ anyhow = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } -async-trait = { workspace = true } clap = { workspace = true, features = ["derive"] } indicatif = { workspace = true } -itertools = { workspace = true } parquet = { workspace = true } tokio = { workspace = true, features = ["full"] } tracing = { workspace = true } diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs 
b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index 85267c358e2..260d4726857 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -69,11 +69,10 @@ use crate::VariantTimings; pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { let file = File::open(parquet_path) .with_context(|| format!("open parquet file {}", parquet_path.display()))?; - let file_size = file.metadata()?.len(); let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; // Locate the `emb` column and sanity-check its type. - let (emb_idx, emb_field) = builder + let (_, emb_field) = builder .schema() .column_with_name("emb") .context("parquet schema missing `emb` column")?; @@ -93,7 +92,6 @@ pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result = reader.collect::, _>>()?; @@ -114,7 +112,6 @@ pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result, new_dim: usize) -> Result<()> /// The flattened representation of an embedding column, suitable for a hand-rolled /// distance loop. Intentionally decoupled from any format — the compute side doesn't /// care how the data got into this `Vec`. +/// +/// The benchmark's "size" measurement for the handrolled baseline comes from +/// [`crate::PreparedDataset::parquet_bytes`] (which is populated once in +/// [`crate::prepare_dataset`]), not from this struct. We deliberately don't carry +/// the file size in here — doing so would duplicate state between two places that +/// can go out of sync. pub struct HandrolledBaselineData { /// All rows concatenated: `elements.len() == num_rows * dim`. pub elements: Vec, @@ -182,11 +185,6 @@ pub struct HandrolledBaselineData { pub dim: usize, /// Number of rows. pub num_rows: usize, - /// On-disk size of the parquet file in bytes. 
Reported as the "handrolled size" - /// measurement because parquet is the only format this benchmark serializes to - /// on disk today — a future v2 could report additional storage formats (raw f32 - /// blob, Arrow IPC, etc.) alongside it. - pub file_size: u64, } /// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index bfa0aa55c8d..7be88e7b734 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -112,11 +112,6 @@ impl Variant { } } -/// Number of rows in the query vector — matches the database so `ScalarFnArray`'s -/// equal-length contract is satisfied. This type alias exists to make the broadcast -/// semantics obvious at call sites. -type QueryLen = usize; - /// A materialized Vortex array and its associated execution session / context. pub struct PreparedDataset { /// Name used in metric strings — usually the dataset's `Dataset::name()`. @@ -361,8 +356,6 @@ pub fn run_timings( iterations: usize, session: &VortexSession, ) -> Result { - let _ = QueryLen::default; // touch the type alias so rustc doesn't warn - let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index 7dd6a564f36..edac6d1fc27 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -41,11 +41,6 @@ pub const LOSSLESS_TOLERANCE: f32 = 1e-4; /// that catches regressions without flaking on distribution-specific noise. pub const LOSSY_TOLERANCE: f32 = 0.2; -/// Row index used to pick the verification query. Row 0 is also what -/// [`crate::DEFAULT_QUERY_ROW`] selects, so the verification and timing paths exercise -/// identical inputs. 
-pub const VERIFICATION_QUERY_ROW: usize = 0; - /// How lossy a variant is allowed to be when its scores are compared to the /// uncompressed baseline. #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index e31c3520d2f..d640f583ee9 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -372,8 +372,6 @@ mod tests { use vortex::array::validity::Validity; use vortex::buffer::BufferMut; use vortex::dtype::DType; - use vortex::dtype::Nullability; - use vortex::dtype::PType; use super::list_to_vector_ext; @@ -463,9 +461,5 @@ mod tests { err.contains("element type must be float"), "unexpected error: {err}", ); - - // The unused Nullability / PType imports exist to make the intent clear; suppress - // the dead-code warning by referencing them in a no-op expression. - let _ = (Nullability::NonNullable, PType::I32); } } From 3f026220bb9b5da50a5cbec0a08b7a02fd0b01ad Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:08:18 +0000 Subject: [PATCH 13/18] vector-search-bench: dedupe helpers and consolidate verify paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four related cleanups the review flagged for modularity & clarity: 1. Deleted the duplicate `extract_query_row` helper from `recall.rs` (30 lines near-identical to the one in `lib.rs`). Both call sites now go through a single `pub(crate)` helper in `lib.rs`. The lib.rs helper also gets a fresh doc comment explicitly calling out its f32-only requirement (landed in the previous commit) so the deduped version is not just shorter but more correct. 2. Added `verify::verify_and_report_scores`: a score-vector-first variant of `verify_variant` that takes pre-computed scores as `&[f32]` instead of an `ArrayRef`. `verify_variant` now delegates to it. 
The point is to make both main-loop paths share a single error-handling / logging / bail-on-lossless codepath: - Vortex variants compute scores via `compute_cosine_scores(array)` and call `verify_variant`, which then calls `verify_and_report_scores` under the hood. - The handrolled baseline computes scores via `cosine_loop(&[f32])` and calls `verify_and_report_scores` directly. Previously `main.rs` hand-rolled the handrolled path's check/log/ bail triplet, duplicating logic in two places that could drift. 3. Dropped the `compress_turboquant` and `build_similarity_search_tree` wrapper shims from `vortex-tensor/benches/similarity_search_common/ mod.rs`. They existed only to re-expose the public helpers under local names for bench call-site compatibility. The bench now `pub use`s them directly, which is one fewer indirection layer. `compress_default` stays in the bench module because its `BtrBlocksCompressor` dependency lives in vortex-tensor's dev-dependencies, not main deps, so lifting it to the public `vector_search` module would pull `vortex-btrblocks` into vortex-tensor's main dep graph. 4. Replaced the stale "in future commits" doc comment on `PreparedDataset::uncompressed` — recall is already wired. All 13 vector-search-bench + 222 vortex-tensor tests pass. clippy / rustfmt clean. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 8 ++-- benchmarks/vector-search-bench/src/main.rs | 43 ++++++++----------- benchmarks/vector-search-bench/src/recall.rs | 33 +------------- benchmarks/vector-search-bench/src/verify.rs | 41 +++++++++++++----- .../benches/similarity_search_common/mod.rs | 27 +++--------- 5 files changed, 62 insertions(+), 90 deletions(-) diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 7be88e7b734..37cc3cb0ef2 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -116,12 +116,14 @@ impl Variant { pub struct PreparedDataset { /// Name used in metric strings — usually the dataset's `Dataset::name()`. pub name: String, - /// Uncompressed `Vector` array (canonical form). This is reused as the - /// ground-truth basis for TurboQuant recall checks in future commits. + /// Uncompressed `Vector` array (canonical form). Doubles as the + /// ground-truth basis for the correctness-verification pass and for TurboQuant's + /// Recall@K quality measurement. pub uncompressed: ArrayRef, /// The query vector to use (a single row pulled from the dataset). pub query: Vec, - /// Parquet file size on disk in bytes — produced by the dataset download step. + /// Parquet file size on disk in bytes — produced by the dataset download step + /// and reused as the "handrolled size" measurement in main.rs. 
pub parquet_bytes: u64, } diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index e6ad3ad8357..7cae73cf823 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -29,6 +29,8 @@ use clap::Parser; use indicatif::ProgressBar; use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; +use vector_search_bench::handrolled_baseline::cosine_loop; +use vector_search_bench::handrolled_baseline::read_parquet_embedding_column; use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; @@ -37,6 +39,7 @@ use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; use vector_search_bench::verify::VerificationKind; use vector_search_bench::verify::compute_cosine_scores; +use vector_search_bench::verify::verify_and_report_scores; use vector_search_bench::verify::verify_variant; use vortex_bench::Format; use vortex_bench::SESSION; @@ -221,40 +224,35 @@ async fn main() -> Result<()> { // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; - let baseline_data = - vector_search_bench::handrolled_baseline::read_parquet_embedding_column( - &parquet_path, - ) + let label = "handrolled"; + let bench_name = format!("{label}/{}", prepared.name); + + // Verify the handrolled cosine scores against the Vortex baseline before + // any timing starts. `verify_and_report_scores` is the same helper the + // Vortex-variant loop ends up calling through `verify_variant`, so the + // two paths share all their pass/fail / log / bail logic. 
+ let baseline_data = read_parquet_embedding_column(&parquet_path) .context("read parquet emb column for verification")?; - let handrolled_scores = vector_search_bench::handrolled_baseline::cosine_loop( + let handrolled_scores = cosine_loop( &baseline_data.elements, baseline_data.num_rows, baseline_data.dim, &prepared.query, ); - let handrolled_report = vector_search_bench::verify::verify_scores( - &baseline_scores, + let handrolled_report = verify_and_report_scores( + &bench_name, &handrolled_scores, + &baseline_scores, VerificationKind::Lossless, - ); - if !handrolled_report.passed { - anyhow::bail!( - "handrolled baseline correctness check failed on {}: \ - max_abs_diff={:.6}, mean_abs_diff={:.6}, tolerance={:.6}", - prepared.name, - handrolled_report.max_abs_diff, - handrolled_report.mean_abs_diff, - handrolled_report.tolerance(), - ); - } + )?; tracing::info!( - "handrolled/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", - prepared.name, + "{} verification (Lossless): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, handrolled_report.max_abs_diff, handrolled_report.mean_abs_diff, ); verification.push(CustomUnitMeasurement { - name: format!("correctness-max-diff/handrolled/{}", prepared.name), + name: format!("correctness-max-diff/{bench_name}"), format: Format::Parquet, unit: Cow::from("abs-diff"), value: handrolled_report.max_abs_diff, @@ -267,9 +265,6 @@ async fn main() -> Result<()> { args.iterations, )?; - let label = "handrolled"; - let bench_name = format!("{label}/{}", prepared.name); - sizes.push(CustomUnitMeasurement { name: format!("{label} size/{}", prepared.name), format: Format::Parquet, diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index 1c587616d4e..c091b74305e 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -16,12 +16,10 @@ use anyhow::Result; use vortex::array::ArrayRef; -use 
vortex::array::VortexSessionExecute; -use vortex::array::arrays::PrimitiveArray; -use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::session::VortexSession; use vortex::utils::aliases::hash_set::HashSet; +use crate::extract_query_row; use crate::verify::compute_cosine_scores; /// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. @@ -60,7 +58,7 @@ pub fn measure_recall_at_k( for q in 0..num_queries { let row = (q * step).min(num_rows - 1); - let query = extract_query_row(uncompressed, row, session)?; + let query = extract_query_row(uncompressed, row)?; let gt_scores = compute_cosine_scores(uncompressed, &query, session)?; let truth = top_k_indices(>_scores, top_k); @@ -76,33 +74,6 @@ pub fn measure_recall_at_k( Ok(total_hits as f64 / total_checked as f64) } -fn extract_query_row( - vector_ext: &ArrayRef, - row: usize, - session: &VortexSession, -) -> Result> { - use anyhow::Context; - use vortex::array::arrays::Extension; - use vortex::array::arrays::FixedSizeListArray; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - - let mut ctx = session.create_execution_ctx(); - let ext = vector_ext - .as_opt::() - .context("extract_query_row expects an Extension array")?; - let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx)?; - - let dim_usize = match fsl.dtype() { - vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim as usize, - other => anyhow::bail!("expected FixedSizeList storage, got {other}"), - }; - - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; - let slice = elements.as_slice::(); - let start = row * dim_usize; - Ok(slice[start..start + dim_usize].to_vec()) -} - /// Return the indices of the top-K highest scores, stable-sorted descending. 
fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { let mut idx: Vec = (0..scores.len()).collect(); diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index edac6d1fc27..cbca8a3adc5 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -146,21 +146,23 @@ pub fn verify_scores( } } -/// End-to-end variant verification: executes cosine on `variant_array` against the -/// same query used for the baseline and returns a [`VerificationReport`]. Returns -/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond -/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality -/// tradeoff. -pub fn verify_variant( +/// Verify pre-computed scores against a baseline and enforce the tolerance band. +/// +/// Takes already-materialized `variant_scores` (as a `&[f32]`) rather than an +/// `ArrayRef`, so both the Vortex-variant path (which computes scores via +/// [`execute_cosine`](crate::execute_cosine)) and the hand-rolled baseline path (which +/// runs a plain Rust loop over a flat `Vec`) share the same error-handling, +/// logging, and hard-fail logic without duplicating it in `main.rs`. +/// +/// Lossless mismatches bail the run with an error; lossy mismatches log a warning +/// but let the run continue so the recall measurement is still reported. 
+pub fn verify_and_report_scores( variant_name: &str, - variant_array: &ArrayRef, - query: &[f32], + variant_scores: &[f32], baseline_scores: &[f32], kind: VerificationKind, - session: &VortexSession, ) -> Result { - let scores = compute_cosine_scores(variant_array, query, session)?; - let report = verify_scores(baseline_scores, &scores, kind); + let report = verify_scores(baseline_scores, variant_scores, kind); if !report.passed { let message = format!( @@ -182,6 +184,23 @@ pub fn verify_variant( Ok(report) } +/// End-to-end variant verification: executes cosine on `variant_array` against the +/// same query used for the baseline and returns a [`VerificationReport`]. Returns +/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond +/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality +/// tradeoff. +pub fn verify_variant( + variant_name: &str, + variant_array: &ArrayRef, + query: &[f32], + baseline_scores: &[f32], + kind: VerificationKind, + session: &VortexSession, +) -> Result { + let scores = compute_cosine_scores(variant_array, query, session)?; + verify_and_report_scores(variant_name, &scores, baseline_scores, kind) +} + #[cfg(test)] mod tests { use vortex_bench::SESSION; diff --git a/vortex-tensor/benches/similarity_search_common/mod.rs b/vortex-tensor/benches/similarity_search_common/mod.rs index ee0bab128ed..b95867b26b7 100644 --- a/vortex-tensor/benches/similarity_search_common/mod.rs +++ b/vortex-tensor/benches/similarity_search_common/mod.rs @@ -47,8 +47,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_panic; use vortex_session::VortexSession; use vortex_tensor::vector::Vector; -use vortex_tensor::vector_search::build_similarity_search_tree as public_build_similarity_search_tree; -use vortex_tensor::vector_search::compress_turboquant as public_compress_turboquant; +pub use vortex_tensor::vector_search::build_similarity_search_tree; +pub use 
vortex_tensor::vector_search::compress_turboquant; /// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and /// example can create execution contexts cheaply. @@ -141,17 +141,14 @@ pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec VortexResult { BtrBlocksCompressor::default().compress(&data) } -/// Compresses a raw `Vector` array with the TurboQuant pipeline. This is a thin -/// wrapper around [`vortex_tensor::vector_search::compress_turboquant`] preserved for bench -/// call-site compatibility. -pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - public_compress_turboquant(data, ctx) -} - /// Dispatch helper that builds the data array for the requested [`Variant`], starting from a /// single random-vector generation. Always returns an `ArrayRef` whose logical dtype is /// `Vector`. @@ -169,15 +166,3 @@ pub fn build_variant( Variant::TurboQuant => compress_turboquant(raw, ctx), } } - -/// Build the lazy similarity-search array tree for a prepared data array and a single query -/// vector. Thin wrapper around -/// [`vortex_tensor::vector_search::build_similarity_search_tree`] preserved for bench -/// call-site compatibility. -pub fn build_similarity_search_tree( - data: ArrayRef, - query: &[f32], - threshold: f32, -) -> VortexResult { - public_build_similarity_search_tree(data, query, threshold) -} From cbcb078d608c709cfcac80883d90f2cd1b99bf94 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:09:40 +0000 Subject: [PATCH 14/18] vector-search-bench: optimize offset validation, interleave timings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two micro-refactors for correctness quality: 1. `list_to_vector_ext` offset validation walks `ListArrayExt::offset_at` N+1 times (once per row boundary) instead of 2N times (once per start and once per end of every row). 
Each iteration now carries the previous `end` into the next iteration's `prev_end`, so we only fetch each offset boundary once. For a canonical list produced by `parquet_to_vortex_chunks` the offsets are a `Primitive` array and `offset_at` takes the fast slice-index path, so this halves the offset lookups from ~200K to ~100K on a 100K-row column. The loop body is still O(1) per row either way. 2. `run_timings` now interleaves its decompress / cosine / filter stages inside a single outer loop over iterations, instead of running the three stages as back-to-back sequential loops. The back-to-back shape gives each stage an asymmetric cache profile — the second+ iterations of the cosine stage run on cache lines left behind by the previous *cosine* iteration, while the second+ iterations of the filter stage run on cache lines left behind by the *cosine* stage. Interleaving makes each iteration of each stage see roughly the same cache state, removing the asymmetry. Each stage still gets a fresh `ExecutionCtx` so no cached scalar-fn state leaks between stages within a single iteration. The doc comment on `run_timings` now spells out why interleaving matters. All 13 vector-search-bench + 16 vortex-bench tests still pass. Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 55 ++++++++++++++--------- vortex-bench/src/conversions.rs | 15 +++++-- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 37cc3cb0ef2..d5d5df1f1a2 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -352,6 +352,18 @@ pub fn prepare_variant( /// Run the decompress / cosine / filter microbenchmarks against a prepared variant /// array and return the best-of-`iterations` wall times for each measurement. 
+/// +/// The three stages are **interleaved** inside a single outer loop rather than run +/// as three separate back-to-back loops. Interleaving keeps each stage's cache / +/// branch-predictor / allocator state symmetric across iterations — a pathology of +/// the back-to-back shape is that iteration `N+1` of the cosine stage runs on +/// warmed caches left behind by iteration `N` of the cosine stage, while iteration +/// `N+1` of the filter stage runs on caches left behind by the *cosine* stage. The +/// interleaved form makes each stage see roughly the same cache state every +/// iteration. +/// +/// Each stage still gets a fresh `ExecutionCtx`, so no cached scalar-fn state leaks +/// between stages within a single iteration. pub fn run_timings( variant_array: &ArrayRef, query: &[f32], @@ -363,27 +375,28 @@ pub fn run_timings( let mut filter = Duration::MAX; for _ in 0..iterations { - let mut ctx = session.create_execution_ctx(); - let start = Instant::now(); - let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; - decompress = decompress.min(start.elapsed()); - drop(decoded); - } - - for _ in 0..iterations { - let mut ctx = session.create_execution_ctx(); - let start = Instant::now(); - let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; - cosine = cosine.min(start.elapsed()); - drop(scores); - } - - for _ in 0..iterations { - let mut ctx = session.create_execution_ctx(); - let start = Instant::now(); - let matches: BoolArray = execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; - filter = filter.min(start.elapsed()); - drop(matches); + { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; + decompress = decompress.min(start.elapsed()); + drop(decoded); + } + { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let scores: PrimitiveArray = 
execute_cosine(variant_array, query, &mut ctx)?; + cosine = cosine.min(start.elapsed()); + drop(scores); + } + { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let matches: BoolArray = + execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; + filter = filter.min(start.elapsed()); + drop(matches); + } } Ok(VariantTimings { diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index d640f583ee9..1a486034919 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -317,19 +317,25 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { vortex_bail!("list_to_vector_ext: cannot infer vector dimension from empty input"); } - let first_start = list.offset_at(0)?; + // Walk the offsets array once, reusing the previous iteration's `end` as the + // next iteration's `start`. Each `offset_at` call goes through + // `ListArrayExt::offset_at`, which has a fast path when the offsets child is a + // `Primitive` array (direct slice index). That's the common case after + // `parquet_to_vortex_chunks`, so for a 100K-row column we do ~100K primitive + // slice indexes rather than 200K. The loop body is O(1) either way. 
+ let mut prev_end = list.offset_at(0)?; let first_end = list.offset_at(1)?; - let dim = first_end.checked_sub(first_start).ok_or_else(|| { + let dim = first_end.checked_sub(prev_end).ok_or_else(|| { vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") })?; if dim == 0 { vortex_bail!("list_to_vector_ext: first row has zero elements"); } + prev_end = first_end; for i in 1..num_rows { - let start = list.offset_at(i)?; let end = list.offset_at(i + 1)?; - let row_len = end.checked_sub(start).ok_or_else(|| { + let row_len = end.checked_sub(prev_end).ok_or_else(|| { vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") })?; if row_len != dim { @@ -340,6 +346,7 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { dim ); } + prev_end = end; } let elements = list.sliced_elements()?; From 93082288be02233bef021d93e9000a4683f7778e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:12:31 +0000 Subject: [PATCH 15/18] vortex-bench: add ListView test, make VectorDataset::to_vortex_array bail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two review items related to testing coverage and trait-contract honesty: 1. `list_to_vector_ext` has five unit tests but none of them exercised the `ListView` → `List` fast path, even though that path is the one the benchmark actually hits after `parquet_to_vortex_chunks` (which canonicalizes list columns into `ListView`, not `List`). Added a `list_view_input_is_rewrapped_as_vector_extension` test with a `list_view_f32(dim, rows)` helper that constructs a `ListViewArray` with uniform-stride offsets and sizes, then asserts `list_to_vector_ext` returns an `Extension` whose storage is a `FixedSizeList`. A regression to the ListView handling would now be caught by unit tests rather than by the end-to-end benchmark smoke test. `vortex-bench` tests are now 17 (was 16). 2. 
`VectorDataset::to_vortex_array` previously implemented the `Dataset` trait by downloading the parquet file, writing it through the default Vortex write path, and reading it back. The returned array was a `StructArray` with `{ id: int64, emb: list }` — **not** the `Extension(FixedSizeList<...>)` shape the vector-search benchmark actually operates on. The benchmark worked around this by bypassing `to_vortex_array` entirely and rebuilding the Vector extension via `parquet_to_vortex_chunks` + `list_to_vector_ext`. But that left the trait method as a trap for future callers who expect it to return the same shape the benchmark measures. Replaced the body with `bail!` and a message that points callers at the parquet path + list_to_vector_ext sequence. This makes the contract unambiguous: either use the parquet path, or get a clear error instead of a semantically-wrong array. The unused imports (`tokio::fs::File`, `ArrayStreamExt`, `OpenOptionsSessionExt`, `WriteOptionsSessionExt`, `SESSION`, `parquet_to_vortex_chunks`, `idempotent_async`, `IntoArray`) are dropped too. 13 vector-search-bench + 17 vortex-bench tests pass. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- vortex-bench/src/conversions.rs | 55 ++++++++++++++++++++++++++++ vortex-bench/src/vector_dataset.rs | 58 ++++++++++++------------------ 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 1a486034919..52fc1571a6a 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -374,6 +374,7 @@ mod tests { use vortex::array::IntoArray; use vortex::array::arrays::Extension; use vortex::array::arrays::List; + use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::validity::Validity; @@ -447,6 +448,60 @@ mod tests { ); } + /// Build a `ListView` whose every row is a length-`dim` slice of the flattened + /// `values` buffer. This shape matches what `parquet_to_vortex_chunks` produces for + /// embedding columns after arrow-rs' canonicalization, and exercises the + /// `list_to_vector_ext` fast-path that collapses `ListView` → `List` before + /// validating offsets. 
+ fn list_view_f32(dim: usize, rows: &[&[f32]]) -> vortex::array::ArrayRef { + let mut values = BufferMut::::with_capacity(rows.len() * dim); + for row in rows { + assert_eq!(row.len(), dim); + for &v in row.iter() { + values.push(v); + } + } + let elements = + PrimitiveArray::new::(values.freeze(), Validity::NonNullable).into_array(); + + let dim_i32 = i32::try_from(dim).unwrap(); + let num_rows = rows.len(); + + let mut offsets_buf = BufferMut::::with_capacity(num_rows); + for i in 0..num_rows { + offsets_buf.push(i32::try_from(i).unwrap() * dim_i32); + } + let offsets = + PrimitiveArray::new::(offsets_buf.freeze(), Validity::NonNullable).into_array(); + + let mut sizes_buf = BufferMut::::with_capacity(num_rows); + for _ in 0..num_rows { + sizes_buf.push(dim_i32); + } + let sizes = + PrimitiveArray::new::(sizes_buf.freeze(), Validity::NonNullable).into_array(); + + ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable) + .unwrap() + .into_array() + } + + #[test] + fn list_view_input_is_rewrapped_as_vector_extension() { + // Simulates the post-parquet-ingest shape: the `emb` column arrives as a + // ListView, not a List. `list_to_vector_ext` must materialize it via + // `recursive_list_from_list_view` and then validate offsets on the flattened + // `List` form. + let list_view = list_view_f32(3, &[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]]); + let wrapped = list_to_vector_ext(list_view).unwrap(); + assert_eq!(wrapped.len(), 2); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + #[test] fn non_float_element_type_is_rejected() { // Build a List. 
diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs index 86086ac57ac..e6b049fe614 100644 --- a/vortex-bench/src/vector_dataset.rs +++ b/vortex-bench/src/vector_dataset.rs @@ -18,20 +18,13 @@ use std::path::PathBuf; use anyhow::Result; +use anyhow::bail; use async_trait::async_trait; -use tokio::fs::File; use vortex::array::ArrayRef; -use vortex::array::IntoArray; -use vortex::array::stream::ArrayStreamExt; -use vortex::file::OpenOptionsSessionExt; -use vortex::file::WriteOptionsSessionExt; use crate::IdempotentPath; -use crate::SESSION; -use crate::conversions::parquet_to_vortex_chunks; use crate::datasets::Dataset; use crate::datasets::data_downloads::download_data; -use crate::idempotent_async; /// A public embedding-vector dataset used by the vector-search benchmark. /// @@ -171,35 +164,28 @@ impl Dataset for VectorDataset { Ok(parquet) } + /// **Not supported.** `VectorDataset` can't return a straight Vortex array via + /// [`Dataset::to_vortex_array`] because: + /// + /// - The struct-shaped array the other datasets return would arrive as + /// `{ id: int64, emb: list }` — with `emb` as a *list*, not the + /// `Extension(FixedSizeList<...>)` shape the vector-search benchmark + /// actually operates on. + /// - The benchmark therefore bypasses this method entirely: it calls + /// [`Dataset::to_parquet_path`] and then runs + /// [`crate::conversions::parquet_to_vortex_chunks`] + + /// [`crate::conversions::list_to_vector_ext`] itself, which produces the + /// correct `Extension` shape. + /// + /// Returning the raw struct here would be a trap for future callers who expect + /// the same semantic shape the benchmark measures. Bailing explicitly makes the + /// contract unambiguous. 
async fn to_vortex_array(&self) -> Result { - let parquet = self.to_parquet_path().await?; - let dir = format!("{}/", self.name()).to_data_path(); - let vortex = dir.join(format!("{}.vortex", self.name())); - - let data = parquet_to_vortex_chunks(parquet).await?; - idempotent_async(&vortex, async |path| -> Result<()> { - SESSION - .write_options() - .write( - &mut File::create(path) - .await - .map_err(|e| anyhow::anyhow!("Failed to create file: {}", e))?, - data.into_array().to_array_stream(), - ) - .await - .map_err(|e| anyhow::anyhow!("Failed to write vortex file: {}", e))?; - Ok(()) - }) - .await?; - - Ok(SESSION - .open_options() - .open_path(vortex.as_path()) - .await? - .scan()? - .into_array_stream()? - .read_all() - .await?) + bail!( + "VectorDataset::to_vortex_array is not supported; use `to_parquet_path` + \ + `parquet_to_vortex_chunks` + `list_to_vector_ext` to build the \ + Extension shape the benchmark needs" + ); } } From 8c1628b5762021954e479d81fb7b41d72fa1b611 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:14:20 +0000 Subject: [PATCH 16/18] vector-search-bench: style nits from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five small style-only cleanups identified during the review pass. Pure documentation + formatting — no behavior change. 1. `list_to_vector_ext` had one error message missing the `"list_to_vector_ext: ..."` prefix every other error used (`"list_to_vector_ext expects a List array ..."` → `"list_to_vector_ext: expected a List array ..."`). The test that asserts the substring is updated too. 2. `handrolled_baseline::filter_loop` now has an explicit comment flagging the `>` comparison as the *strict* greater-than that must stay in sync with the Vortex-side `Operator::Gt` in `build_similarity_search_tree`. 
A divergence between the two would show up as a max-abs-diff correctness failure for the lossless variants, but the comment makes the invariant explicit so the next person to touch either side thinks about it. 3. `gen_synthetic_dataset.rs` had three magic-number constants in the per-element value computation (`0.00013`, `0.00007`, `0.25`, plus `1.0/32768.0` for the random scale). Promoted them to named constants (`POS_FREQ_ROW`, `POS_FREQ_COL`, `POS_AMPLITUDE`, `RAND_SCALE`) with a comment explaining the intent: the sinusoid mixes the `(row, col)` position so vectors stay distinct even at low bit widths after quantization, and the frequency constants are chosen small and coprime to avoid short-period aliasing over the 100K-row × 1536-col domain. 4. `gen_synthetic_dataset.rs`'s module doc said "bit-identical across runs". That overpromises: the generator uses `f32::sin`, whose last few ULPs are libm/CPU-dependent. Changed to "deterministic... on the same machine" with an explicit note about cross-machine variance. Signed-off-by: Claude Signed-off-by: Connor Tsui --- .../src/bin/gen_synthetic_dataset.rs | 30 ++++++++++++++----- .../src/handrolled_baseline.rs | 6 +++- vortex-bench/src/conversions.rs | 4 +-- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs index 16e96cce5b4..d8a26578bc9 100644 --- a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs +++ b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs @@ -6,8 +6,10 @@ //! access to `assets.zilliz.com`, and for sandbox / CI environments that block outbound //! HTTPS. //! -//! The generated file is bit-identical across runs for a given `(num_rows, dim, seed)` -//! triple so that downstream benchmark output is reproducible. +//! The generated file is deterministic for a given `(num_rows, dim, seed)` triple, so +//! 
downstream benchmark output is reproducible across runs on the same machine. Exact +//! bit-for-bit equality across different machines is not guaranteed because the PRNG +//! mixes in `f32::sin()`, whose last few ULPs are libm/CPU-dependent. //! //! Example: //! @@ -81,17 +83,31 @@ fn main() -> Result<()> { let mut offsets = Int32BufferBuilder::new(args.num_rows + 1); offsets.append(0i32); + // Generate per-element values as (random noise in `[-0.5, 0.5)`) + (position-based + // sinusoid of amplitude 0.25). The xorshift gives the random component; the sine + // mixes the row and column index so that vectors at different positions are distinct + // even at low bit widths after quantization. + // + // The sinusoid frequency constants below are deliberately small and coprime so that + // (row, col) → sine values don't repeat across the 100K-row × 1536-col domain any + // faster than ~100K rows. They don't have any particular mathematical meaning — they + // just need to be "slow enough to avoid short-period aliasing, fast enough that + // different rows look different". + const POS_FREQ_ROW: f32 = 0.00013; + const POS_FREQ_COL: f32 = 0.00007; + const POS_AMPLITUDE: f32 = 0.25; + const RAND_SCALE: f32 = 1.0 / 32768.0; + let mut state = args.seed.wrapping_add(1); for row in 0..args.num_rows { for i in 0..dim_usize { - // Deterministic xorshift mixed with position so every vector is distinct. 
state ^= state << 13; state ^= state >> 7; state ^= state << 17; - let scale = 1.0f32 / 32768.0; - let v = ((state & 0xFFFF) as f32 * scale - 0.5) - + ((row as f32 * 0.00013) + (i as f32 * 0.00007)).sin() * 0.25; - float_values.append_value(v); + let rand_component = (state & 0xFFFF) as f32 * RAND_SCALE - 0.5; + let pos_component = + ((row as f32 * POS_FREQ_ROW) + (i as f32 * POS_FREQ_COL)).sin() * POS_AMPLITUDE; + float_values.append_value(rand_component + pos_component); } let written = i32::try_from((row + 1) * dim_usize) .context("offset overflows i32 — reduce num_rows or dim")?; diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index 260d4726857..bf91915908f 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -258,7 +258,11 @@ pub fn cosine_loop(elements: &[f32], num_rows: usize, dim: usize, query: &[f32]) out } -/// Build the `cosine > threshold` boolean mask. +/// Build the `cosine > threshold` boolean mask — **strict greater-than**, matching the +/// Vortex-side path which uses `Operator::Gt` in +/// [`vortex_tensor::vector_search::build_similarity_search_tree`]. Keep these two in +/// sync: if one changes the comparison semantics, the correctness-verification pass will +/// start reporting a mismatch for the lossless variants. 
pub fn filter_loop(scores: &[f32], threshold: f32) -> Vec { scores.iter().map(|&s| s > threshold).collect() } diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 52fc1571a6a..8793ed23b9a 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -278,7 +278,7 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { let Some(list) = input.as_opt::() else { vortex_bail!( - "list_to_vector_ext expects a List array, got dtype {}", + "list_to_vector_ext: expected a List array, got dtype {}", input.dtype() ); }; @@ -433,7 +433,7 @@ mod tests { .into_array(); let err = list_to_vector_ext(primitive).unwrap_err().to_string(); assert!( - err.contains("expects a List array"), + err.contains("expected a List array"), "unexpected error: {err}" ); } From 17f574d74c25db7418b0303f786eb63435fdd7ba Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 16:01:30 +0000 Subject: [PATCH 17/18] =?UTF-8?q?vector-search-bench:=20second=20review=20?= =?UTF-8?q?pass=20=E2=80=94=20missed=20f32=20cast,=20imports,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A second pass over the branch caught several items I missed in the first review cleanup. All code-style and correctness-level, no behavior changes beyond the missed f32 assertion. ## Missed bug `verify::compute_cosine_scores` had the same blind `PrimitiveArray::as_slice::()` call I already fixed in `extract_query_row` last round — I only patched the lib.rs copy, not the verify.rs one. Added the `scores.ptype() != PType::F32` check with a clear error message before the slice cast. Matches the doc contract that the function returns `Vec` and only supports f32 Vector columns today. ## CLAUDE.md style violations CLAUDE.md's import rule: "All imports must be at the top of the module, never inside functions. The only exception is `#[cfg(test)]` blocks, where imports should be at the top of the test module." 
I had function-scoped `use` statements sprinkled across five functions in
`lib.rs` and inside an `extract_row_zero` test helper in `verify.rs`, all
against the guidance. Moved every function-scoped `use` into the top-level
import block:

- `prepare_dataset`: 1 import moved
- `extract_emb_column`: 5 imports moved
- `extract_query_row`: 4 imports moved
- `decompress_full_scan`: 3 imports moved
- `execute_cosine`: 2 imports moved
- `PreparedDataset::dim`: 2 fully-qualified `vortex::dtype::DType::...` paths
  replaced with plain `DType::...` from top-level import

## Test duplication

`verify::tests::extract_row_zero` was a 10-line reimplementation of
`extract_query_row` inside the test module. Deleted it and made
`verify::tests::make_prepared` call `extract_query_row` directly, which also
sidesteps the need for function-scoped imports in the test. Same rework
applied to `lib::tests::prepare_variant_produces_...` (which had its own
inline f32 extraction) and `recall::tests::...` (which was building a
`PreparedDataset` with an empty query). A new `test_prepared` helper in
`lib.rs` tests replaces the three separate inline constructions with a single
shared builder.

## New test coverage

Added 8 new tests across the three modules:

- `vortex-bench::conversions::all_invalid_list_validity_is_rejected`: builds a
  `List` with `Validity::AllInvalid` and confirms the rejection path fires.
  Previously unexercised.
- `vector-search-bench::tests::extract_query_row_returns_the_right_slice`:
  verifies row 0 extraction is idempotent and different rows differ.
- `vector-search-bench::tests::extract_query_row_rejects_out_of_bounds_row`:
  verifies the `row >= len` bounds check.
- `vector-search-bench::verify::tests::verify_scores_fails_on_nan_in_baseline`:
  symmetric NaN case — the existing `verify_scores_fails_on_nan` test only
  covered the variant-side NaN.
- `verify::tests::verify_and_report_scores_bails_for_lossless_mismatch`: lossless failures must hard-error with a message that includes the variant name. - `verify::tests::verify_and_report_scores_warns_for_lossy_mismatch_without_bailing`: lossy failures must NOT bail — they log a warning and return the failing report so the caller can still emit it as a measurement. - `recall::tests::top_k_indices_handles_nan_without_panicking`: NaN scores used to sort via `partial_cmp(...).unwrap_or(Equal)`, which produced arbitrary orderings. Switched to `f32::total_cmp` for a proper total order; test confirms non-NaN scores still rank in descending order and the sort doesn't panic. ## Minor - Stale comment on `main.rs::total_work` calculation: referenced `args.formats.len()` but the expression uses a different formula after commit 74aaf5a. Fixed the comment. - `top_k_indices` switched from `scores[b].partial_cmp(&scores[a]).unwrap_or(Ordering::Equal)` to `scores[b].total_cmp(&scores[a])`. Added a doc comment on the function explaining the NaN-safety rationale. ## Test counts after this commit - vortex-bench: 18 tests (was 17, +1 for `all_invalid_list_validity`) - vector-search-bench: 20 tests passing + 1 ignored (was 13 + 1, +7 for the new coverage above) - vortex-tensor: 222 tests (unchanged) Clippy / rustfmt clean. 
End-to-end smoke run on synthetic cohere-small still reports: - all four correctness-max-diff values coherent (0.000e0 for the three lossless variants, 5.18e-3 for TurboQuant) - Recall@10 = 0.91 for TurboQuant Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 140 ++++++++++--------- benchmarks/vector-search-bench/src/main.rs | 6 +- benchmarks/vector-search-bench/src/recall.rs | 42 +++++- benchmarks/vector-search-bench/src/verify.rs | 97 ++++++++++--- vortex-bench/src/conversions.rs | 25 ++++ 5 files changed, 210 insertions(+), 100 deletions(-) diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index d5d5df1f1a2..f710e9ba8aa 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -41,11 +41,23 @@ use anyhow::Result; use anyhow::bail; use clap::ValueEnum; use vortex::array::ArrayRef; +use vortex::array::ExecutionCtx; use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; use vortex::array::arrays::BoolArray; +use vortex::array::arrays::Chunked; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrays::Extension; +use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::Struct; +use vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::array::arrays::extension::ExtensionArrayExt; +use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex::array::arrays::struct_::StructArrayExt as _; +use vortex::dtype::DType; +use vortex::dtype::PType; use vortex::session::VortexSession; use vortex_bench::Format; use vortex_bench::SESSION; @@ -54,6 +66,8 @@ use vortex_bench::conversions::parquet_to_vortex_chunks; use vortex_bench::datasets::Dataset; use vortex_bench::vector_dataset::VectorDataset; use vortex_btrblocks::BtrBlocksCompressor; +use 
vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; +use vortex_tensor::vector_search::build_constant_query_vector; use vortex_tensor::vector_search::build_similarity_search_tree; use vortex_tensor::vector_search::compress_turboquant; @@ -137,16 +151,12 @@ impl PreparedDataset { /// and it guarantees this shape. pub fn dim(&self) -> u32 { let fsl_dtype = match self.uncompressed.dtype() { - vortex::dtype::DType::Extension(ext) => ext.storage_dtype(), - other => { - vortex::error::vortex_panic!("expected Extension, got {other}") - } + DType::Extension(ext) => ext.storage_dtype(), + other => vortex::error::vortex_panic!("expected Extension, got {other}"), }; match fsl_dtype { - vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim, - other => { - vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}") - } + DType::FixedSizeList(_, dim, _) => *dim, + other => vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}"), } } @@ -159,8 +169,6 @@ impl PreparedDataset { /// Prepare a dataset by downloading its parquet file, converting the `emb` column to a /// `Vector` extension array, and extracting a single-row query vector. pub async fn prepare_dataset(dataset: &VectorDataset) -> Result { - use vortex::array::arrays::ExtensionArray; - let parquet_path = dataset .to_parquet_path() .await @@ -202,12 +210,6 @@ pub async fn prepare_dataset(dataset: &VectorDataset) -> Result /// Project the `emb` column out of a chunked struct array. This rebuilds a chunked list /// array with just that one column. 
fn extract_emb_column(struct_array: &ArrayRef) -> Result { - use vortex::array::arrays::Chunked; - use vortex::array::arrays::ChunkedArray; - use vortex::array::arrays::Struct; - use vortex::array::arrays::chunked::ChunkedArrayExt; - use vortex::array::arrays::struct_::StructArrayExt as _; - if let Some(chunked) = struct_array.as_opt::() { let mut emb_chunks: Vec = Vec::with_capacity(chunked.nchunks()); for chunk in chunked.iter_chunks() { @@ -238,11 +240,6 @@ fn extract_emb_column(struct_array: &ArrayRef) -> Result { /// restricts itself to `f32` vectors, so we assert the element type rather than /// quietly returning a mis-cast slice. pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { - use vortex::array::arrays::Extension; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - use vortex::dtype::PType; - if row >= vector_ext.len() { bail!( "query row {row} out of bounds for dataset of length {}", @@ -260,7 +257,7 @@ pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result *d as usize, + DType::FixedSizeList(_, d, _) => *d as usize, other => bail!("storage dtype must be FixedSizeList, got {other}"), }; @@ -437,12 +434,8 @@ pub struct VariantTimings { /// pipeline. 
pub fn decompress_full_scan( array: &ArrayRef, - ctx: &mut vortex::array::ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> Result { - use vortex::array::arrays::ExtensionArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - let ext: ExtensionArray = array.clone().execute(ctx)?; let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; // Force the element buffer all the way down to a canonical PrimitiveArray so the @@ -465,11 +458,8 @@ pub fn decompress_full_scan( pub fn execute_cosine( data: &ArrayRef, query: &[f32], - ctx: &mut vortex::array::ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> Result { - use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; - use vortex_tensor::vector_search::build_constant_query_vector; - let num_rows = data.len(); let query_vec = build_constant_query_vector(query, num_rows)?; let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows)?.into_array(); @@ -480,7 +470,7 @@ fn execute_filter( data: &ArrayRef, query: &[f32], threshold: f32, - ctx: &mut vortex::array::ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> Result { let tree = build_similarity_search_tree(data.clone(), query, threshold)?; Ok(tree.execute(ctx)?) @@ -525,36 +515,62 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use vortex::array::arrays::FixedSizeListArray; - use vortex::array::arrays::PrimitiveArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex_bench::SESSION; use super::test_utils::synthetic_vector; use super::*; + /// Build a test `PreparedDataset` from synthetic data, pulling the query from + /// row 0 via the shared `extract_query_row` helper so all tests exercise the + /// ptype-assertion path the benchmark hot path uses. 
+ fn test_prepared(dim: u32, num_rows: usize, seed: u64) -> PreparedDataset { + let uncompressed = synthetic_vector(dim, num_rows, seed); + let query = extract_query_row(&uncompressed, 0).unwrap(); + PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query, + parquet_bytes: 0, + } + } + + #[test] + fn extract_query_row_returns_the_right_slice() { + let dim = 8u32; + let num_rows = 4usize; + let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); + + // Row 0 extraction was already used to populate `prepared.query`; check it + // agrees with a second extraction for row 0, and that row 3 (last) is + // different (as it should be for distinct synthetic vectors). + let row0 = extract_query_row(&prepared.uncompressed, 0).unwrap(); + let row3 = extract_query_row(&prepared.uncompressed, 3).unwrap(); + assert_eq!(row0, prepared.query); + assert_eq!(row0.len(), dim as usize); + assert_eq!(row3.len(), dim as usize); + assert_ne!(row0, row3, "different rows must differ for this seed"); + } + + #[test] + fn extract_query_row_rejects_out_of_bounds_row() { + let dim = 8u32; + let num_rows = 4usize; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); + + let err = extract_query_row(&prepared.uncompressed, 4) + .unwrap_err() + .to_string(); + assert!( + err.contains("query row 4 out of bounds"), + "unexpected error: {err}" + ); + } + #[test] fn prepare_variant_produces_non_empty_array_for_all_variants() { let dim = 128u32; let num_rows = 64usize; - let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); - - let ext = uncompressed - .as_opt::() - .unwrap(); - let mut ctx = SESSION.create_execution_ctx(); - let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); - let slice = elements.as_slice::(); - let query = slice[..dim as usize].to_vec(); - - let prepared = PreparedDataset { - name: "synthetic".to_string(), - uncompressed: 
uncompressed.clone(), - query, - parquet_bytes: 0, - }; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); for variant in [ Variant::VortexUncompressed, @@ -585,14 +601,7 @@ mod tests { fn uncompressed_decompress_is_fast() { let dim = 128u32; let num_rows = 256usize; - let uncompressed = synthetic_vector(dim, num_rows, 0xDEADBEEF); - - let prepared = PreparedDataset { - name: "synthetic".to_string(), - uncompressed, - query: vec![0.1f32; dim as usize], - parquet_bytes: 0, - }; + let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); let uncompressed_prep = prepare_variant(&prepared, Variant::VortexUncompressed, &SESSION).unwrap(); @@ -628,14 +637,7 @@ mod tests { fn print_variant_trees() { let dim = 768u32; let num_rows = 500usize; - let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); - - let prepared = PreparedDataset { - name: "synthetic".to_string(), - uncompressed, - query: vec![0.1f32; dim as usize], - parquet_bytes: 0, - }; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); for variant in [ Variant::VortexUncompressed, diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 7cae73cf823..261d07becb3 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -179,9 +179,9 @@ async fn main() -> Result<()> { .filter_map(|f| f.into_variant()) .collect(); - // `args.formats.len()` counts both the handrolled baseline and the Vortex variants, - // so it matches the number of `progress.inc(1)` calls we'll make below (one per - // Vortex variant plus one per dataset for the handrolled path when it's enabled). + // One progress unit per inner-loop body: each Vortex variant plus the handrolled + // path (when it's enabled) gets exactly one `progress.inc(1)` below. Keep this + // count in sync with the number of `progress.inc` sites. 
let total_work = datasets.len() * (variants.len() + usize::from(run_handrolled_baseline)); let progress = ProgressBar::new(total_work as u64); diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index c091b74305e..e4fbd721f28 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -75,13 +75,15 @@ pub fn measure_recall_at_k( } /// Return the indices of the top-K highest scores, stable-sorted descending. +/// +/// Uses `f32::total_cmp` for a NaN-safe total order — `partial_cmp` would panic on +/// NaN, and `partial_cmp(...).unwrap_or(Ordering::Equal)` would put NaNs at +/// arbitrary positions. `total_cmp` gives NaNs a well-defined (but meaningless) sort +/// slot, which lets the function be robust against accidental NaN inputs without +/// silently hiding them. fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { let mut idx: Vec = (0..scores.len()).collect(); - idx.sort_by(|&a, &b| { - scores[b] - .partial_cmp(&scores[a]) - .unwrap_or(std::cmp::Ordering::Equal) - }); + idx.sort_by(|&a, &b| scores[b].total_cmp(&scores[a])); idx.truncate(top_k); idx } @@ -92,9 +94,34 @@ mod tests { use super::*; use crate::Variant; + use crate::extract_query_row; use crate::prepare_variant; use crate::test_utils::synthetic_vector; + #[test] + fn top_k_indices_handles_nan_without_panicking() { + // `partial_cmp` panics on NaN (well, returns None, which was silently swallowed + // before). `total_cmp` gives NaN a well-defined slot, so the sort doesn't + // panic and doesn't produce arbitrary orderings for non-NaN elements. + let scores = [0.9f32, f32::NAN, 0.7, 0.5, f32::NAN]; + let top = top_k_indices(&scores, 3); + assert_eq!(top.len(), 3); + // The finite values 0.9, 0.7, 0.5 should still rank in the right order + // relative to each other — NaNs sort somewhere, but the finite ordering is + // preserved because `total_cmp` is a total order. 
+ let finite_positions: Vec = top + .iter() + .copied() + .filter(|&i| !scores[i].is_nan()) + .collect(); + assert!( + finite_positions + .windows(2) + .all(|w| scores[w[0]] >= scores[w[1]]), + "finite scores should still be in descending order" + ); + } + #[test] fn uncompressed_has_perfect_self_recall() { let dim = 128u32; @@ -114,10 +141,13 @@ mod tests { let num_rows = 64usize; let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + // `measure_recall_at_k` doesn't need the PreparedDataset's `query` field — + // it derives queries internally via `extract_query_row` on `uncompressed`. + // Construct just enough of a `PreparedDataset` to pass to `prepare_variant`. let prepared = crate::PreparedDataset { name: "synthetic".to_string(), uncompressed: uncompressed.clone(), - query: vec![], + query: extract_query_row(&uncompressed, 0).unwrap(), parquet_bytes: 0, }; diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index cbca8a3adc5..ca58ff44b13 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -24,6 +24,7 @@ use anyhow::Result; use anyhow::bail; use vortex::array::ArrayRef; use vortex::array::VortexSessionExecute; +use vortex::dtype::PType; use vortex::session::VortexSession; use crate::execute_cosine; @@ -78,9 +79,17 @@ impl VerificationReport { } /// Compute cosine-similarity scores for a single query row on `data` and return them -/// as a plain `Vec`. This is just a convenience wrapper around +/// as a plain `Vec`. This is a convenience wrapper around /// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting /// `PrimitiveArray`. +/// +/// # Errors +/// +/// Returns an error if [`execute_cosine`] fails (bad input shape or dispatch error), +/// or if the cosine expression produces a non-`f32` primitive array. 
The latter can't +/// happen today because the benchmark only wires `f32` `Vector` columns, but the +/// explicit ptype check keeps the function sound if the scalar-fn output type ever +/// widens (e.g. to `f64`) without the caller noticing. pub fn compute_cosine_scores( data: &ArrayRef, query: &[f32], @@ -88,6 +97,12 @@ pub fn compute_cosine_scores( ) -> Result> { let mut ctx = session.create_execution_ctx(); let scores = execute_cosine(data, query, &mut ctx)?; + if scores.ptype() != PType::F32 { + bail!( + "compute_cosine_scores: cosine output must be f32, got {:?}", + scores.ptype() + ); + } Ok(scores.as_slice::().to_vec()) } @@ -207,35 +222,26 @@ mod tests { use super::*; use crate::Variant; + use crate::extract_query_row; use crate::prepare_variant; use crate::test_utils::synthetic_vector; + /// Build a `PreparedDataset` whose `query` is row 0 of the dataset. Using + /// `extract_query_row` here (rather than a test-local f32 extraction helper) also + /// keeps the test surface covered by the same ptype-assertion path the benchmark + /// hot path uses. fn make_prepared(dim: u32, num_rows: usize, seed: u64) -> crate::PreparedDataset { let uncompressed = synthetic_vector(dim, num_rows, seed); + let query = extract_query_row(&uncompressed, 0).unwrap(); + assert_eq!(query.len(), dim as usize); crate::PreparedDataset { name: "synthetic".to_string(), uncompressed, - // Filled in below from row 0. 
- query: vec![], + query, parquet_bytes: 0, } } - fn extract_row_zero(uncompressed: &ArrayRef, dim: u32) -> Vec { - use vortex::array::VortexSessionExecute; - use vortex::array::arrays::Extension; - use vortex::array::arrays::FixedSizeListArray; - use vortex::array::arrays::PrimitiveArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - - let mut ctx = SESSION.create_execution_ctx(); - let ext = uncompressed.as_opt::().unwrap(); - let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); - elements.as_slice::()[..dim as usize].to_vec() - } - #[test] fn compare_scores_handles_empty() { let (mean, max) = compare_scores(&[], &[]); @@ -305,12 +311,60 @@ mod tests { assert!(report.max_abs_diff.is_infinite()); } + #[test] + fn verify_scores_fails_on_nan_in_baseline() { + // Symmetric case: NaN on the baseline side should also fail, not just variant. 
+ let base = [0.5f32, f32::NAN]; + let other = [0.5f32, 0.5]; + let report = verify_scores(&base, &other, VerificationKind::Lossless); + assert!(!report.passed); + assert!(report.max_abs_diff.is_infinite()); + } + + #[test] + fn verify_and_report_scores_is_ok_for_identical_inputs() { + let base = [0.5f32; 10]; + let report = + verify_and_report_scores("self", &base, &base, VerificationKind::Lossless).unwrap(); + assert!(report.passed); + assert_eq!(report.max_abs_diff, 0.0); + } + + #[test] + fn verify_and_report_scores_bails_for_lossless_mismatch() { + let base = [0.5f32; 10]; + let mut other = [0.5f32; 10]; + other[3] = 0.6; + let err = + verify_and_report_scores("broken-variant", &other, &base, VerificationKind::Lossless) + .unwrap_err() + .to_string(); + assert!( + err.contains("broken-variant correctness check failed"), + "unexpected error: {err}" + ); + } + + #[test] + fn verify_and_report_scores_warns_for_lossy_mismatch_without_bailing() { + // A lossy variant outside its tolerance should NOT bail — it logs a warning + // and returns the failing report so the caller can still emit the + // measurement and show recall alongside it. 
+ let base = [0.9f32; 10]; + let mut other = [0.9f32; 10]; + other[0] = 1.5; // diff of 0.6, above the 0.2 lossy tolerance + let report = + verify_and_report_scores("too-lossy-variant", &other, &base, VerificationKind::Lossy) + .expect("lossy failures should not bail"); + assert!(!report.passed); + assert!(report.max_abs_diff > f64::from(LOSSY_TOLERANCE)); + } + #[test] fn vortex_default_matches_uncompressed_end_to_end() { let dim = 128u32; let num_rows = 64usize; - let mut prepared = make_prepared(dim, num_rows, 0xC0FFEE); - prepared.query = extract_row_zero(&prepared.uncompressed, dim); + let prepared = make_prepared(dim, num_rows, 0xC0FFEE); let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); @@ -332,8 +386,7 @@ mod tests { fn vortex_turboquant_stays_within_lossy_tolerance() { let dim = 128u32; let num_rows = 64usize; - let mut prepared = make_prepared(dim, num_rows, 0xDEADBEEF); - prepared.query = extract_row_zero(&prepared.uncompressed, dim); + let prepared = make_prepared(dim, num_rows, 0xDEADBEEF); let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 8793ed23b9a..70330e77f91 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -502,6 +502,31 @@ mod tests { )); } + #[test] + fn all_invalid_list_validity_is_rejected() { + // A list with `Validity::AllInvalid` means every row is null. The Vector + // extension type requires non-nullable elements at the FSL level, so we + // must reject this input rather than silently dropping the validity mask. 
+ let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 3, 6]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = + vortex::array::Array::::new(elements, offsets, Validity::AllInvalid).into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("list rows must be non-nullable"), + "unexpected error: {err}" + ); + } + #[test] fn non_float_element_type_is_rejected() { // Build a List. From 87ac1c683f948c736cb20b22443693544ce8391a Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 16:20:24 +0000 Subject: [PATCH 18/18] =?UTF-8?q?vector-search-bench:=20third=20review=20p?= =?UTF-8?q?ass=20=E2=80=94=20drop=20session,=20dedupe=20parquet=20reads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three targeted fixes from the third review pass. ## 1. Drop vestigial `session: &VortexSession` parameters The benchmark always passed `&vortex_bench::SESSION` to every function that took a session parameter, so the parameter was plumbing with zero value. More importantly it was *inconsistent* — some internal helpers (like `extract_query_row`) already hardcoded the global, so the API suggested a custom session was respected end-to-end when it wasn't. Removed the `session` parameter from: - `lib::prepare_variant(prepared, variant)` - `lib::run_timings(variant_array, query, iterations)` - `verify::compute_cosine_scores(data, query)` - `verify::verify_variant(name, array, query, baseline, kind)` - `recall::measure_recall_at_k(uncompressed, compressed, num_queries, top_k)` Each function now calls `vortex_bench::SESSION.create_execution_ctx()` directly where needed. Doc comments note the "uses the global SESSION" contract explicitly. 
`execute_cosine`, `decompress_full_scan`, and `execute_filter` still take `&mut ExecutionCtx` because they're the low-level primitives that `run_timings` constructs fresh contexts for on each iteration. All call sites in `main.rs` and the test modules updated. ## 2. Fix the handrolled double-parquet-read `main.rs` previously did: 1. Read parquet → run cosine → verify against baseline → bail on fail 2. Call `run_handrolled_baseline_timings`, which reads parquet N more times inside its own loop Total: N+1 parquet reads per dataset per run. The extra read was wasted work — `cosine_loop` is deterministic, so the scores from iteration N of the timing loop are bit-identical to a one-shot pre-timing computation. Restructured `run_handrolled_baseline_timings` to return a new `HandrolledBaselineResult { timings, last_scores }` where `last_scores` is the `Vec` produced by the final iteration. `main.rs` now: 1. Call `run_handrolled_baseline_timings` → gets timings + last_scores 2. Call `verify_and_report_scores` with those last_scores vs baseline → bails on lossless mismatch Total: N parquet reads per dataset per run. One less read, and the verification now uses the exact same scores the timing loop computed rather than a separate pass. Order reversal note: verification now runs *after* timing instead of before. The handrolled loop is cheap enough (~5 ms cosine + ~40 ms parquet read per iter) that running it once even for a broken variant is acceptable, and the correctness bail still prevents the run from reporting wrong-answer numbers. `run_handrolled_baseline_timings` now asserts `iterations >= 1` up front (previously relied on `last_scores` being populated after the loop). ## 3. Nullable element type test `list_to_vector_ext` has a rejection path for `List` — element dtype is a nullable f32 (even when every value is present). Added `conversions::tests::nullable_element_dtype_is_rejected` to exercise this path. 
Passing `Validity::AllValid` to `PrimitiveArray::new::` produces the nullable dtype the rejection path is looking for. ## Also: housekeeping - Dropped `use vortex_bench::SESSION` from the `lib.rs` inner test module (was left over from the session-parameter removal). - Replaced two fully-qualified `vortex::error::vortex_panic!` calls in `PreparedDataset::dim()` with a plain `vortex_panic!` sourced from a top-level import. - Refreshed the doc comment on `PreparedDataset` — it previously described the struct as carrying an "execution session / context", which is no longer accurate. - Added two new tests for `run_handrolled_baseline_timings`: - `run_handrolled_baseline_timings_returns_last_iteration_scores`: verifies the `last_scores` contract on a 3-row fixture. - `run_handrolled_baseline_timings_panics_on_zero_iterations`: regression guard for the iterations>=1 assert. ## Test counts - vortex-bench: 19 tests (was 18, +1 for nullable_element_dtype) - vector-search-bench: 22 passing + 1 ignored (was 20+1, +2 for the handrolled timing tests) - vortex-tensor: 222 tests (unchanged) - Total: 263 passing, 3 ignored clippy / rustfmt clean. 
End-to-end smoke run still coherent: - correctness-max-diff = 0.000e0 for all three lossless variants - correctness-max-diff = 5.18e-3 for TurboQuant (within 0.2 tolerance) - Recall@10 for TurboQuant = 0.9267 Signed-off-by: Claude Signed-off-by: Connor Tsui --- .../src/handrolled_baseline.rs | 86 +++++++++++++++++-- benchmarks/vector-search-bench/src/lib.rs | 51 +++++------ benchmarks/vector-search-bench/src/main.rs | 56 +++++------- benchmarks/vector-search-bench/src/recall.rs | 17 ++-- benchmarks/vector-search-bench/src/verify.rs | 29 +++---- vortex-bench/src/conversions.rs | 29 +++++++ 6 files changed, 168 insertions(+), 100 deletions(-) diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index bf91915908f..f62c5198fb6 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -187,20 +187,51 @@ pub struct HandrolledBaselineData { pub num_rows: usize, } +/// Result of running the hand-rolled baseline timing loop. +/// +/// Carries both the best-of-N timing numbers **and** the cosine scores from the final +/// iteration. The scores are exposed so the caller can feed them into +/// [`crate::verify::verify_and_report_scores`] for the correctness check without +/// re-reading the parquet file. Because `cosine_loop` is deterministic, the scores +/// from any iteration equal the scores from every other iteration; using the last +/// one is simply the most convenient snapshot. +pub struct HandrolledBaselineResult { + /// Best-of-N wall times for decompress / cosine / filter. + pub timings: VariantTimings, + /// Cosine-similarity scores from the final iteration. Length equals the dataset + /// row count. + pub last_scores: Vec, +} + /// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline -/// and return the best-of-N wall times. 
The decompress phase re-reads the parquet file -/// from disk on each iteration (matches how the Vortex variants re-execute their tree -/// from scratch each iteration), and the compute phase runs [`cosine_loop`] and -/// [`filter_loop`] over the flat `Vec` the decompress phase produced. +/// and return the best-of-N wall times along with the last iteration's cosine scores. +/// +/// The decompress phase re-reads the parquet file from disk on each iteration (matches +/// how the Vortex variants re-execute their tree from scratch each iteration), and the +/// compute phase runs [`cosine_loop`] and [`filter_loop`] over the flat `Vec` the +/// decompress phase produced. Returning the last iteration's scores lets the caller +/// perform correctness verification against the Vortex baseline without a redundant +/// parquet read. +/// +/// # Panics +/// +/// Panics if `iterations == 0`. The benchmark CLI defaults to 5 and the lowest +/// meaningful value is 1 (single-shot best-of-1). pub fn run_handrolled_baseline_timings( parquet_path: &Path, query: &[f32], threshold: f32, iterations: usize, -) -> Result { +) -> Result { + assert!( + iterations > 0, + "run_handrolled_baseline_timings requires iterations >= 1" + ); + let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; + let mut last_scores: Vec = Vec::new(); for _ in 0..iterations { let start = Instant::now(); @@ -216,12 +247,17 @@ pub fn run_handrolled_baseline_timings( let matches = filter_loop(&scores, threshold); filter = filter.min(start.elapsed()); debug_assert_eq!(matches.len(), data.num_rows); + + last_scores = scores; } - Ok(VariantTimings { - decompress, - cosine, - filter, + Ok(HandrolledBaselineResult { + timings: VariantTimings { + decompress, + cosine, + filter, + }, + last_scores, }) } @@ -330,4 +366,36 @@ mod tests { let mask = filter_loop(&scores, 0.5); assert_eq!(mask, vec![true, false, true]); } + + #[test] + fn 
run_handrolled_baseline_timings_returns_last_iteration_scores() { + // Verifies the new `last_scores` contract: the timing loop returns the + // cosine scores from the final iteration, and those scores match what we'd + // get from a one-shot `cosine_loop` on the same data. Callers of + // `run_handrolled_baseline_timings` rely on this for verification (so they + // don't need a second parquet read to compute ground-truth scores). + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + let query = [1.0f32, 0.0, 0.0]; + + let result = run_handrolled_baseline_timings(file.path(), &query, 0.5, 3).unwrap(); + + // Deterministic expected scores: rows 0 and 2 match the query exactly, + // row 1 is orthogonal. + assert_eq!(result.last_scores, vec![1.0, 0.0, 1.0]); + assert!(result.timings.decompress > Duration::ZERO); + assert!(result.timings.cosine > Duration::ZERO); + assert!(result.timings.filter > Duration::ZERO); + } + + #[test] + #[should_panic(expected = "iterations >= 1")] + fn run_handrolled_baseline_timings_panics_on_zero_iterations() { + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + let query = [1.0f32, 0.0, 0.0]; + let _result = run_handrolled_baseline_timings(file.path(), &query, 0.5, 0); + } } diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index f710e9ba8aa..0a70fdfed1e 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -58,7 +58,7 @@ use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex::array::arrays::struct_::StructArrayExt as _; use vortex::dtype::DType; use vortex::dtype::PType; -use vortex::session::VortexSession; +use vortex::error::vortex_panic; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_bench::conversions::list_to_vector_ext; @@ -126,7 +126,8 @@ impl Variant { } } -/// A 
materialized Vortex array and its associated execution session / context. +/// The ingested form of a dataset, ready to be fed to [`prepare_variant`] and the +/// timing/verification pipeline. pub struct PreparedDataset { /// Name used in metric strings — usually the dataset's `Dataset::name()`. pub name: String, @@ -152,11 +153,11 @@ impl PreparedDataset { pub fn dim(&self) -> u32 { let fsl_dtype = match self.uncompressed.dtype() { DType::Extension(ext) => ext.storage_dtype(), - other => vortex::error::vortex_panic!("expected Extension, got {other}"), + other => vortex_panic!("expected Extension, got {other}"), }; match fsl_dtype { DType::FixedSizeList(_, dim, _) => *dim, - other => vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}"), + other => vortex_panic!("expected FixedSizeList storage, got {other}"), } } @@ -292,7 +293,8 @@ pub struct PreparedVariant { /// Apply a `Variant`'s preparation strategy to the materialized uncompressed source and /// return the resulting tree together with its reported in-memory size and construction -/// time. +/// time. Uses the global [`vortex_bench::SESSION`] for any execution-context work; the +/// benchmark has no reason to support multiple concurrent sessions. /// /// **Why nbytes instead of on-disk size?** The Vortex file writer applies BtrBlocks /// compression as part of its default write strategy regardless of the in-memory tree @@ -301,11 +303,7 @@ pub struct PreparedVariant { /// compressed tree — the disk-size comparison collapses two conceptually different /// things into one number. Reporting `nbytes()` of the in-memory tree keeps the size /// measurement consistent with what the *compute* measurements operate on. 
-pub fn prepare_variant( - prepared: &PreparedDataset, - variant: Variant, - session: &VortexSession, -) -> Result { +pub fn prepare_variant(prepared: &PreparedDataset, variant: Variant) -> Result { match variant { Variant::VortexUncompressed => { // Identity: the uncompressed Extension is already materialized. Still @@ -333,7 +331,7 @@ pub fn prepare_variant( }) } Variant::VortexTurboQuant => { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; let compress_duration = start.elapsed(); @@ -359,13 +357,13 @@ pub fn prepare_variant( /// interleaved form makes each stage see roughly the same cache state every /// iteration. /// -/// Each stage still gets a fresh `ExecutionCtx`, so no cached scalar-fn state leaks -/// between stages within a single iteration. +/// Each stage still gets a fresh `ExecutionCtx` (from the global +/// [`vortex_bench::SESSION`]), so no cached scalar-fn state leaks between stages +/// within a single iteration. 
pub fn run_timings( variant_array: &ArrayRef, query: &[f32], iterations: usize, - session: &VortexSession, ) -> Result { let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; @@ -373,21 +371,21 @@ pub fn run_timings( for _ in 0..iterations { { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; decompress = decompress.min(start.elapsed()); drop(decoded); } { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; cosine = cosine.min(start.elapsed()); drop(scores); } { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let matches: BoolArray = execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; @@ -515,8 +513,6 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use vortex_bench::SESSION; - use super::test_utils::synthetic_vector; use super::*; @@ -577,7 +573,7 @@ mod tests { Variant::VortexDefault, Variant::VortexTurboQuant, ] { - let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); + let prep = prepare_variant(&prepared, variant).unwrap(); assert_eq!( prep.array.len(), num_rows, @@ -585,7 +581,7 @@ mod tests { ); assert!(prep.nbytes > 0, "variant {variant:?} reported zero size"); - let timings = run_timings(&prep.array, &prepared.query, 2, &SESSION).unwrap(); + let timings = run_timings(&prep.array, &prepared.query, 2).unwrap(); // TurboQuant + default must do real work; uncompressed's decompress is a // no-op and can plausibly time as zero. 
assert!(timings.cosine > Duration::ZERO); @@ -603,14 +599,11 @@ mod tests { let num_rows = 256usize; let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); - let uncompressed_prep = - prepare_variant(&prepared, Variant::VortexUncompressed, &SESSION).unwrap(); - let turboquant_prep = - prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let uncompressed_prep = prepare_variant(&prepared, Variant::VortexUncompressed).unwrap(); + let turboquant_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); - let unc_timings = - run_timings(&uncompressed_prep.array, &prepared.query, 3, &SESSION).unwrap(); - let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3, &SESSION).unwrap(); + let unc_timings = run_timings(&uncompressed_prep.array, &prepared.query, 3).unwrap(); + let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3).unwrap(); // The uncompressed decompress should be at least an order of magnitude faster // than TurboQuant's (usually many orders of magnitude). 
5x is a loose lower @@ -644,7 +637,7 @@ mod tests { Variant::VortexDefault, Variant::VortexTurboQuant, ] { - let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); + let prep = prepare_variant(&prepared, variant).unwrap(); println!("=== {variant:?} ==="); println!(" len : {}", prep.array.len()); println!(" nbytes : {}", prep.nbytes); diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 261d07becb3..5e78563ef22 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -29,8 +29,6 @@ use clap::Parser; use indicatif::ProgressBar; use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; -use vector_search_bench::handrolled_baseline::cosine_loop; -use vector_search_bench::handrolled_baseline::read_parquet_embedding_column; use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; @@ -42,7 +40,6 @@ use vector_search_bench::verify::compute_cosine_scores; use vector_search_bench::verify::verify_and_report_scores; use vector_search_bench::verify::verify_variant; use vortex_bench::Format; -use vortex_bench::SESSION; use vortex_bench::create_output_writer; use vortex_bench::datasets::Dataset; use vortex_bench::display::DisplayFormat; @@ -202,9 +199,8 @@ async fn main() -> Result<()> { // Ground-truth cosine scores for the verification query — the scores produced by // the uncompressed Vortex scan. Every other variant (including the hand-rolled // baseline) will be compared against this. 
- let baseline_scores = - compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION) - .context("compute ground-truth cosine scores for verification")?; + let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query) + .context("compute ground-truth cosine scores for verification")?; tracing::info!( "computed {} ground-truth cosine scores for {}", baseline_scores.len(), @@ -222,26 +218,30 @@ async fn main() -> Result<()> { // parquet on disk — only the *compute* is hand-rolled. The metric `name` field // carries the `handrolled` label so human readers can tell the compute apart // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. + // + // Timing runs first and returns the cosine scores from its final iteration; + // verification then reuses those scores rather than re-reading the parquet + // file. `cosine_loop` is deterministic, so the last-iteration scores equal + // what a separate pre-timing verification pass would produce — we just save + // one parquet read per dataset. If the scores drift from the Vortex baseline, + // `verify_and_report_scores` bails here (after the timing already ran, which + // is acceptable because the handrolled loop is cheap and we'd rather run it + // twice than skip correctness). if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; let label = "handrolled"; let bench_name = format!("{label}/{}", prepared.name); - // Verify the handrolled cosine scores against the Vortex baseline before - // any timing starts. `verify_and_report_scores` is the same helper the - // Vortex-variant loop ends up calling through `verify_variant`, so the - // two paths share all their pass/fail / log / bail logic. 
- let baseline_data = read_parquet_embedding_column(&parquet_path) - .context("read parquet emb column for verification")?; - let handrolled_scores = cosine_loop( - &baseline_data.elements, - baseline_data.num_rows, - baseline_data.dim, + let baseline_result = run_handrolled_baseline_timings( + &parquet_path, &prepared.query, - ); + DEFAULT_THRESHOLD, + args.iterations, + )?; + let handrolled_report = verify_and_report_scores( &bench_name, - &handrolled_scores, + &baseline_result.last_scores, &baseline_scores, VerificationKind::Lossless, )?; @@ -258,13 +258,6 @@ async fn main() -> Result<()> { value: handrolled_report.max_abs_diff, }); - let baseline_timings = run_handrolled_baseline_timings( - &parquet_path, - &prepared.query, - DEFAULT_THRESHOLD, - args.iterations, - )?; - sizes.push(CustomUnitMeasurement { name: format!("{label} size/{}", prepared.name), format: Format::Parquet, @@ -274,24 +267,24 @@ async fn main() -> Result<()> { timings.push(CompressionTimingMeasurement { name: format!("decompress time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.decompress, + time: baseline_result.timings.decompress, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-similarity time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.cosine, + time: baseline_result.timings.cosine, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-filter time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.filter, + time: baseline_result.timings.filter, }); progress.inc(1); } for &variant in &variants { - let prep = prepare_variant(&prepared, variant, &SESSION)?; + let prep = prepare_variant(&prepared, variant)?; let variant_label = variant.label(); let bench_name = format!("{variant_label}/{}", prepared.name); @@ -311,7 +304,6 @@ async fn main() -> Result<()> { &prepared.query, &baseline_scores, kind, - &SESSION, )?; tracing::info!( "{} verification ({:?}): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", 
@@ -345,8 +337,7 @@ async fn main() -> Result<()> { time: prep.compress_duration, }); - let variant_timings = - run_timings(&prep.array, &prepared.query, args.iterations, &SESSION)?; + let variant_timings = run_timings(&prep.array, &prepared.query, args.iterations)?; timings.push(CompressionTimingMeasurement { name: format!("decompress time/{bench_name}"), @@ -373,7 +364,6 @@ async fn main() -> Result<()> { &prep.array, args.recall_queries, args.recall_k, - &SESSION, )?; tracing::info!("Recall@{} for {}: {:.4}", args.recall_k, bench_name, recall); recalls.push(CustomUnitMeasurement { diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index e4fbd721f28..fb1dff4dc8c 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -16,7 +16,6 @@ use anyhow::Result; use vortex::array::ArrayRef; -use vortex::session::VortexSession; use vortex::utils::aliases::hash_set::HashSet; use crate::extract_query_row; @@ -26,7 +25,8 @@ use crate::verify::compute_cosine_scores; pub const DEFAULT_TOP_K: usize = 10; /// Compute recall@K for the lossy `compressed` variant against the `uncompressed` -/// ground-truth variant, averaged over `num_queries` sampled query rows. +/// ground-truth variant, averaged over `num_queries` sampled query rows. Uses the +/// global [`vortex_bench::SESSION`] for all executions. /// /// Query selection is deterministic: rows are picked uniformly across the dataset at /// `step = uncompressed.len() / num_queries` intervals. 
This keeps the result stable @@ -36,7 +36,6 @@ pub fn measure_recall_at_k( compressed: &ArrayRef, num_queries: usize, top_k: usize, - session: &VortexSession, ) -> Result { assert!( num_queries > 0, @@ -60,10 +59,10 @@ pub fn measure_recall_at_k( let row = (q * step).min(num_rows - 1); let query = extract_query_row(uncompressed, row)?; - let gt_scores = compute_cosine_scores(uncompressed, &query, session)?; + let gt_scores = compute_cosine_scores(uncompressed, &query)?; let truth = top_k_indices(>_scores, top_k); - let lossy_scores = compute_cosine_scores(compressed, &query, session)?; + let lossy_scores = compute_cosine_scores(compressed, &query)?; let lossy = top_k_indices(&lossy_scores, top_k); let truth_set: HashSet = truth.iter().copied().collect(); @@ -90,8 +89,6 @@ fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { #[cfg(test)] mod tests { - use vortex_bench::SESSION; - use super::*; use crate::Variant; use crate::extract_query_row; @@ -128,7 +125,7 @@ mod tests { let num_rows = 64usize; let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); - let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10, &SESSION).unwrap(); + let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10).unwrap(); assert!( (recall - 1.0).abs() < 1e-9, "self-recall must be 1.0, got {recall}" @@ -151,12 +148,12 @@ mod tests { parquet_bytes: 0, }; - let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); // With only 64 random rows, recall@10 won't be 1.0 but it should be well // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch // total regressions without being flaky on distribution noise. 
- let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10, &SESSION).unwrap(); + let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10).unwrap(); assert!( recall >= 0.3, "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index ca58ff44b13..da2a73f1ea2 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -25,7 +25,7 @@ use anyhow::bail; use vortex::array::ArrayRef; use vortex::array::VortexSessionExecute; use vortex::dtype::PType; -use vortex::session::VortexSession; +use vortex_bench::SESSION; use crate::execute_cosine; @@ -81,7 +81,7 @@ impl VerificationReport { /// Compute cosine-similarity scores for a single query row on `data` and return them /// as a plain `Vec`. This is a convenience wrapper around /// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting -/// `PrimitiveArray`. +/// `PrimitiveArray`. Uses the global [`vortex_bench::SESSION`]. /// /// # Errors /// @@ -90,12 +90,8 @@ impl VerificationReport { /// happen today because the benchmark only wires `f32` `Vector` columns, but the /// explicit ptype check keeps the function sound if the scalar-fn output type ever /// widens (e.g. to `f64`) without the caller noticing. -pub fn compute_cosine_scores( - data: &ArrayRef, - query: &[f32], - session: &VortexSession, -) -> Result> { - let mut ctx = session.create_execution_ctx(); +pub fn compute_cosine_scores(data: &ArrayRef, query: &[f32]) -> Result> { + let mut ctx = SESSION.create_execution_ctx(); let scores = execute_cosine(data, query, &mut ctx)?; if scores.ptype() != PType::F32 { bail!( @@ -203,23 +199,20 @@ pub fn verify_and_report_scores( /// same query used for the baseline and returns a [`VerificationReport`]. 
Returns /// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond /// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality -/// tradeoff. +/// tradeoff. Uses the global [`vortex_bench::SESSION`]. pub fn verify_variant( variant_name: &str, variant_array: &ArrayRef, query: &[f32], baseline_scores: &[f32], kind: VerificationKind, - session: &VortexSession, ) -> Result { - let scores = compute_cosine_scores(variant_array, query, session)?; + let scores = compute_cosine_scores(variant_array, query)?; verify_and_report_scores(variant_name, &scores, baseline_scores, kind) } #[cfg(test)] mod tests { - use vortex_bench::SESSION; - use super::*; use crate::Variant; use crate::extract_query_row; @@ -367,16 +360,15 @@ mod tests { let prepared = make_prepared(dim, num_rows, 0xC0FFEE); let baseline_scores = - compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + compute_cosine_scores(&prepared.uncompressed, &prepared.query).unwrap(); - let default_prep = prepare_variant(&prepared, Variant::VortexDefault, &SESSION).unwrap(); + let default_prep = prepare_variant(&prepared, Variant::VortexDefault).unwrap(); let report = verify_variant( "vortex-default", &default_prep.array, &prepared.query, &baseline_scores, VerificationKind::Lossless, - &SESSION, ) .expect("vortex-default must be lossless against the uncompressed baseline"); assert!(report.passed); @@ -389,16 +381,15 @@ mod tests { let prepared = make_prepared(dim, num_rows, 0xDEADBEEF); let baseline_scores = - compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + compute_cosine_scores(&prepared.uncompressed, &prepared.query).unwrap(); - let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); let report = verify_variant( "vortex-turboquant", &tq_prep.array, &prepared.query, &baseline_scores, 
VerificationKind::Lossy, - &SESSION, ) .expect("TurboQuant verification should not error"); assert!( diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 70330e77f91..9811abc903d 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -549,4 +549,33 @@ mod tests { "unexpected error: {err}", ); } + + #[test] + fn nullable_element_dtype_is_rejected() { + // Build a `List` — a list whose elements have nullable dtype (even + // if every value happens to be present). The `Vector` extension type at the + // FSL level requires non-nullable elements, so this must be rejected. + // + // Passing `Validity::AllValid` to `PrimitiveArray::new` sets the ptype's + // nullability to `Nullable`, which is what triggers the rejection path even + // though every value is technically valid. + let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), + Validity::AllValid, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 3, 6]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) + .into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("element type must be non-nullable"), + "unexpected error: {err}" + ); + } }