From f8b218986cf75a171b7bd51717ef01e106bcd4eb Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:30:26 +0000 Subject: [PATCH 01/18] vortex-tensor: promote vector-search helpers to public API Promote `compress_turboquant`, `build_constant_query_vector`, and `build_similarity_search_tree` from the bench-only `similarity_search_common` module into a new public `vortex_tensor::vector_search` module so downstream benchmark crates (and library users doing brute-force vector search) can reuse them without duplicating the canonical recipe. The existing similarity_search bench now delegates to these helpers instead of inlining them, and adds three unit tests for the new public surface: constant-query vector construction, end-to-end tree execution to a BoolArray, and a TurboQuant-roundtrip ranking check. Signed-off-by: Claude Signed-off-by: Connor Tsui --- .../benches/similarity_search_common/mod.rs | 93 +----- vortex-tensor/public-api.lock | 8 + vortex-tensor/src/lib.rs | 2 + vortex-tensor/src/vector_search.rs | 301 ++++++++++++++++++ 4 files changed, 321 insertions(+), 83 deletions(-) create mode 100644 vortex-tensor/src/vector_search.rs diff --git a/vortex-tensor/benches/similarity_search_common/mod.rs b/vortex-tensor/benches/similarity_search_common/mod.rs index c22cb5a9f08..ee0bab128ed 100644 --- a/vortex-tensor/benches/similarity_search_common/mod.rs +++ b/vortex-tensor/benches/similarity_search_common/mod.rs @@ -30,22 +30,14 @@ use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; -use vortex_array::arrays::ConstantArray; use vortex_array::arrays::Extension; use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::extension::ExtensionArrayExt; use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; -use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; -use 
vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::dtype::PType; use vortex_array::dtype::extension::ExtDType; use vortex_array::extension::EmptyMetadata; -use vortex_array::scalar::Scalar; -use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_btrblocks::BtrBlocksCompressor; @@ -54,12 +46,9 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_panic; use vortex_session::VortexSession; -use vortex_tensor::encodings::turboquant::TurboQuantConfig; -use vortex_tensor::encodings::turboquant::turboquant_encode_unchecked; -use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; -use vortex_tensor::scalar_fns::l2_denorm::L2Denorm; -use vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm; use vortex_tensor::vector::Vector; +use vortex_tensor::vector_search::build_similarity_search_tree as public_build_similarity_search_tree; +use vortex_tensor::vector_search::compress_turboquant as public_compress_turboquant; /// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and /// example can create execution contexts cheaply. @@ -146,25 +135,6 @@ pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec` extension array whose storage is a [`ConstantArray`] broadcasting a -/// single query vector across `num_rows` rows. This is how we hand a single query vector to -/// `CosineSimilarity` on the `rhs` side -- `ScalarFnArray` requires both children to have the -/// same length, so we broadcast the query instead of hand-rolling a 1-row input. 
-fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult { - let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable); - - let children: Vec = query - .iter() - .map(|&v| Scalar::primitive(v, Nullability::NonNullable)) - .collect(); - let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable); - - let storage = ConstantArray::new(storage_scalar, num_rows).into_array(); - - let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage.dtype().clone())?.erased(); - Ok(ExtensionArray::new(ext_dtype, storage).into_array()) -} - /// Compresses a raw `Vector` array with the default BtrBlocks pipeline. /// /// [`BtrBlocksCompressor`] walks into the extension array and recursively compresses the @@ -175,36 +145,11 @@ pub fn compress_default(data: ArrayRef) -> VortexResult { BtrBlocksCompressor::default().compress(&data) } -/// Compresses a raw `Vector` array with the TurboQuant pipeline by hand, producing the -/// same tree shape that -/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] would: -/// -/// ```text -/// L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms) -/// ``` -/// -/// Calling the encode helpers directly (instead of going through -/// `BtrBlocksCompressorBuilder::with_turboquant()`) lets this example avoid depending on the -/// `unstable_encodings` feature flag. -/// -/// See `vortex-tensor/src/encodings/turboquant/tests/mod.rs::normalize_and_encode` for the same -/// canonical recipe. +/// Compresses a raw `Vector` array with the TurboQuant pipeline. This is a thin +/// wrapper around [`vortex_tensor::vector_search::compress_turboquant`] preserved for bench +/// call-site compatibility. 
pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let l2_denorm = normalize_as_l2_denorm(data, ctx)?; - let normalized = l2_denorm.child_at(0).clone(); - let norms = l2_denorm.child_at(1).clone(); - let num_rows = l2_denorm.len(); - - let normalized_ext = normalized - .as_opt::() - .vortex_expect("normalized child should be an Extension array"); - - let config = TurboQuantConfig::default(); - // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is the - // invariant `turboquant_encode_unchecked` expects. - let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; - - Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) + public_compress_turboquant(data, ctx) } /// Dispatch helper that builds the data array for the requested [`Variant`], starting from a @@ -226,31 +171,13 @@ pub fn build_variant( } /// Build the lazy similarity-search array tree for a prepared data array and a single query -/// vector. The returned tree is a boolean array of length `data.len()` where position `i` is -/// `true` iff `cosine_similarity(data[i], query) > threshold`. -/// -/// The tree shape is: -/// -/// ```text -/// Binary(Gt, [ -/// CosineSimilarity([data, ConstantArray(query_vec, n)]), -/// ConstantArray(threshold, n), -/// ]) -/// ``` -/// -/// This function does no execution; it is safe to call inside a benchmark setup closure. +/// vector. Thin wrapper around +/// [`vortex_tensor::vector_search::build_similarity_search_tree`] preserved for bench +/// call-site compatibility. 
pub fn build_similarity_search_tree( data: ArrayRef, query: &[f32], threshold: f32, ) -> VortexResult { - let num_rows = data.len(); - let query_vec = build_constant_query_vector(query, num_rows)?; - - let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); - - let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); - let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); - - cosine.binary(threshold_array, Operator::Gt) + public_build_similarity_search_tree(data, query, threshold) } diff --git a/vortex-tensor/public-api.lock b/vortex-tensor/public-api.lock index bec8df1cb29..90aea23a194 100644 --- a/vortex-tensor/public-api.lock +++ b/vortex-tensor/public-api.lock @@ -550,4 +550,12 @@ impl core::marker::Copy for vortex_tensor::vector::VectorMatcherMetadata impl core::marker::StructuralPartialEq for vortex_tensor::vector::VectorMatcherMetadata +pub mod vortex_tensor::vector_search + +pub fn vortex_tensor::vector_search::build_constant_query_vector(query: &[f32], num_rows: usize) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::build_similarity_search_tree(data: vortex_array::array::erased::ArrayRef, query: &[f32], threshold: f32) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::compress_turboquant(data: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_tensor::initialize(session: &vortex_session::VortexSession) diff --git a/vortex-tensor/src/lib.rs b/vortex-tensor/src/lib.rs index 3d3563aa8e4..b3cf6c21695 100644 --- a/vortex-tensor/src/lib.rs +++ b/vortex-tensor/src/lib.rs @@ -25,6 +25,8 @@ pub mod vector; pub mod encodings; +pub mod vector_search; + mod utils; /// Initialize the Vortex tensor library with a Vortex session. 
diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs new file mode 100644 index 00000000000..37adbf27a61 --- /dev/null +++ b/vortex-tensor/src/vector_search.rs @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Reusable helpers for building brute-force vector similarity search expressions over +//! [`Vector`] extension arrays. +//! +//! This module exposes three small building blocks that together make it straightforward to +//! stand up a cosine-similarity-plus-threshold scan on top of a prepared data array: +//! +//! - [`compress_turboquant`] applies the canonical TurboQuant encoding pipeline +//! (`L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)`) to a raw +//! `Vector` array without requiring the caller to plumb the +//! `unstable_encodings` feature flag on the `vortex` facade. +//! - [`build_constant_query_vector`] wraps a single query vector into a +//! [`Vector`] extension array whose storage is a [`ConstantArray`] broadcast +//! across `num_rows` rows. This is the shape expected by +//! [`CosineSimilarity::try_new_array`] for the RHS of a database-vs-query scan. +//! - [`build_similarity_search_tree`] wires everything together into a lazy +//! `Binary(Gt, [CosineSimilarity(data, query), threshold])` expression. +//! +//! Executing the tree from [`build_similarity_search_tree`] into a +//! [`BoolArray`](vortex_array::arrays::BoolArray) yields one boolean per row indicating whether +//! that row's cosine similarity to the query exceeds `threshold`. +//! +//! # Example +//! +//! ```ignore +//! use vortex_array::{ArrayRef, VortexSessionExecute}; +//! use vortex_array::arrays::BoolArray; +//! use vortex_session::VortexSession; +//! use vortex_tensor::vector_search::{build_similarity_search_tree, compress_turboquant}; +//! +//! fn run(session: &VortexSession, data: ArrayRef, query: &[f32]) -> anyhow::Result<()> { +//! 
let mut ctx = session.create_execution_ctx(); +//! let data = compress_turboquant(data, &mut ctx)?; +//! let tree = build_similarity_search_tree(data, query, 0.8)?; +//! let _matches: BoolArray = tree.execute(&mut ctx)?; +//! Ok(()) +//! } +//! ``` +//! +//! [`Vector`]: crate::vector::Vector +//! [`CosineSimilarity::try_new_array`]: crate::scalar_fns::cosine_similarity::CosineSimilarity::try_new_array + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::Extension; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::extension::EmptyMetadata; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::encodings::turboquant::TurboQuantConfig; +use crate::encodings::turboquant::turboquant_encode_unchecked; +use crate::scalar_fns::cosine_similarity::CosineSimilarity; +use crate::scalar_fns::l2_denorm::L2Denorm; +use crate::scalar_fns::l2_denorm::normalize_as_l2_denorm; +use crate::vector::Vector; + +/// Apply the canonical TurboQuant encoding pipeline to a `Vector` array. +/// +/// The returned array has the shape +/// `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` — exactly what +/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] produces when invoked through +/// `BtrBlocksCompressorBuilder::with_turboquant()`, but without requiring callers to enable +/// the `unstable_encodings` feature on the `vortex` facade. 
+/// +/// The input `data` must be a [`Vector`] extension array whose element type is `f32` and whose +/// dimensionality is at least +/// [`turboquant::MIN_DIMENSION`](crate::encodings::turboquant::MIN_DIMENSION). The TurboQuant +/// configuration used is [`TurboQuantConfig::default()`] (8-bit codes, 3 SORF rounds, seed 42). +/// +/// # Errors +/// +/// Returns an error if `data` is not a [`Vector`] extension array, if normalization fails, or +/// if the underlying TurboQuant encoder rejects the input shape. +pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let l2_denorm = normalize_as_l2_denorm(data, ctx)?; + let normalized = l2_denorm.child_at(0).clone(); + let norms = l2_denorm.child_at(1).clone(); + let num_rows = l2_denorm.len(); + + let Some(normalized_ext) = normalized.as_opt::() else { + vortex_bail!("normalize_as_l2_denorm must produce an Extension array child"); + }; + + let config = TurboQuantConfig::default(); + // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is + // the invariant `turboquant_encode_unchecked` expects. + let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; + + Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) +} + +/// Build a `Vector` extension array whose storage is a [`ConstantArray`] broadcasting +/// a single query vector across `num_rows` rows. +/// +/// This is the shape expected for the RHS of a database-vs-query +/// [`CosineSimilarity`](crate::scalar_fns::cosine_similarity::CosineSimilarity) scan: the +/// `ScalarFnArray` contract requires both children to have the same length, so rather than +/// hand-rolling a 1-row input we broadcast the query across the whole database. +/// +/// # Errors +/// +/// Returns an error if the [`Vector`] extension dtype rejects the constructed storage dtype. 
+pub fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult<ArrayRef> {
+    let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable);
+
+    let children: Vec<Scalar> = query
+        .iter()
+        .map(|&v| Scalar::primitive(v, Nullability::NonNullable))
+        .collect();
+    let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable);
+
+    let storage = ConstantArray::new(storage_scalar, num_rows).into_array();
+
+    let ext_dtype = ExtDType::<Vector>::try_new(EmptyMetadata, storage.dtype().clone())?.erased();
+    Ok(ExtensionArray::new(ext_dtype, storage).into_array())
+}
+
+/// Build the lazy similarity-search expression tree for a prepared database array and a
+/// single query vector.
+///
+/// The returned array is a lazy boolean expression of length `data.len()` whose position `i`
+/// is `true` iff `cosine_similarity(data[i], query) > threshold`. Executing it into a
+/// [`BoolArray`](vortex_array::arrays::BoolArray) runs the full scan.
+///
+/// The tree shape is:
+///
+/// ```text
+/// Binary(Gt, [
+///     CosineSimilarity([data, ConstantArray(query_vec, n)]),
+///     ConstantArray(threshold, n),
+/// ])
+/// ```
+///
+/// This function performs no execution; it is safe to call inside a benchmark setup closure.
+///
+/// # Errors
+///
+/// Returns an error if building the broadcast query vector or the threshold comparison fails.
+/// An incompatible `query` length panics via `vortex_expect` rather than returning an error.
+pub fn build_similarity_search_tree( + data: ArrayRef, + query: &[f32], + threshold: f32, +) -> VortexResult { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + + let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows) + .vortex_expect("cosine similarity accepts two matching Vector extension arrays") + .into_array(); + + let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); + let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); + + cosine.binary(threshold_array, Operator::Gt) +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::Extension; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::FixedSizeListArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::bool::BoolArrayExt; + use vortex_array::dtype::extension::ExtDType; + use vortex_array::extension::EmptyMetadata; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_buffer::BufferMut; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::build_constant_query_vector; + use super::build_similarity_search_tree; + use super::compress_turboquant; + use crate::vector::Vector; + + /// Build a `Vector` extension array from a flat f32 slice. Each contiguous + /// group of `DIM` values becomes one row. 
+ fn vector_array(dim: u32, values: &[f32]) -> VortexResult { + let dim_usize = dim as usize; + assert_eq!(values.len() % dim_usize, 0); + let num_rows = values.len() / dim_usize; + + let mut buf = BufferMut::::with_capacity(values.len()); + for &v in values { + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable); + let fsl = FixedSizeListArray::try_new( + elements.into_array(), + dim, + Validity::NonNullable, + num_rows, + )?; + + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) + } + + fn test_session() -> VortexSession { + VortexSession::empty().with::() + } + + #[test] + fn constant_query_vector_has_vector_extension_dtype() -> VortexResult<()> { + let query = vec![1.0f32, 0.0, 0.0, 0.0]; + let rhs = build_constant_query_vector(&query, 5)?; + + assert_eq!(rhs.len(), 5); + assert!(rhs.as_opt::().is_some()); + Ok(()) + } + + #[test] + fn similarity_search_tree_executes_to_bool_array() -> VortexResult<()> { + // 4 rows of 3-dim vectors; the first and last match the query [1, 0, 0]. + let data = vector_array( + 3, + &[ + 1.0, 0.0, 0.0, // + 0.0, 1.0, 0.0, // + 0.0, 0.0, 1.0, // + 1.0, 0.0, 0.0, // + ], + )?; + let query = [1.0f32, 0.0, 0.0]; + + let tree = build_similarity_search_tree(data, &query, 0.5)?; + let mut ctx = test_session().create_execution_ctx(); + let result: BoolArray = tree.execute(&mut ctx)?; + + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), 4); + assert!(bits.value(0)); + assert!(!bits.value(1)); + assert!(!bits.value(2)); + assert!(bits.value(3)); + Ok(()) + } + + #[test] + fn turboquant_roundtrip_preserves_ranking() -> VortexResult<()> { + // Build 6 rows of 128-dim vectors where row 0 is highly correlated with the query. + // TurboQuant should preserve the "row 0 is the best match" ordering. 
+ const DIM: u32 = 128; + const NUM_ROWS: usize = 6; + + let mut values = Vec::::with_capacity(NUM_ROWS * DIM as usize); + let query: Vec = (0..DIM as usize) + .map(|i| ((i as f32) * 0.017).sin()) + .collect(); + + // Row 0: identical to query (cosine=1.0) + values.extend_from_slice(&query); + // Row 1: query + noise + for (i, q) in query.iter().enumerate() { + values.push(q + 0.05 * ((i as f32) * 0.03).cos()); + } + // Rows 2..6: unrelated patterns + for row in 2..NUM_ROWS { + for i in 0..DIM as usize { + values.push(((row as f32 * 1.3 + i as f32) * 0.07).sin()); + } + } + + let data = vector_array(DIM, &values)?; + let mut ctx = test_session().create_execution_ctx(); + let compressed = compress_turboquant(data, &mut ctx)?; + assert_eq!(compressed.len(), NUM_ROWS); + + // Build a tree with a low threshold so row 0 (cosine=1.0 exact) matches. + let tree = build_similarity_search_tree(compressed, &query, 0.95)?; + let result: BoolArray = tree.execute(&mut ctx)?; + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), NUM_ROWS); + assert!( + bits.value(0), + "row 0 (identical to query) must match at threshold 0.95 even after TurboQuant" + ); + Ok(()) + } +} From b30bba29d6543b20f1e8d028fd0164961c0cbcbe Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:40:28 +0000 Subject: [PATCH 02/18] vortex-bench: add VortexTurboQuant format and list_to_vector_ext utility Adds `Format::VortexTurboQuant` as a new Format enum variant, parallel to `VortexCompact`, so downstream dashboards can distinguish TurboQuant-encoded Vortex results from the generic BtrBlocks-compressed OnDiskVortex layout via the existing `Target { engine, format }` grouping key. Also adds `vortex_bench::conversions::list_to_vector_ext`: a utility that rewraps a `List` column (the shape an `emb` column takes when ingested from parquet via `parquet_to_vortex_chunks`) as `Extension(FixedSizeList)`. 
This is the only piece of conversion glue needed to bring a VectorDBBench-style parquet embedding column into a form the vector-search scalar functions expect. It handles both single-list and chunked inputs, validates uniform stride, and rejects nullable, non-float, or mismatched-length inputs. `vortex-bench` gains a direct dependency on `vortex-tensor` for the `Vector` extension type. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 1 + vortex-bench/Cargo.toml | 1 + vortex-bench/src/conversions.rs | 238 +++++++++++++++++++++++++++++++ vortex-bench/src/lib.rs | 9 ++ vortex-bench/src/measurements.rs | 9 +- 5 files changed, 256 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ce1bc853cc..424eebac8f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10241,6 +10241,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-tensor", ] [[package]] diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 62d302f12e1..1859a64a16e 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -63,3 +63,4 @@ vortex = { workspace = true, features = [ "tokio", "zstd", ] } +vortex-tensor = { workspace = true } diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 3f21ab30ba0..13e295e13c4 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -21,17 +21,28 @@ use vortex::VortexSessionDefault; use vortex::array::ArrayRef; use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; +use vortex::array::arrays::Chunked; use vortex::array::arrays::ChunkedArray; +use vortex::array::arrays::ExtensionArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::List; +use vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::array::arrays::list::ListArrayExt; use vortex::array::arrow::FromArrowArray; use vortex::array::builders::builder_with_capacity; +use vortex::array::extension::EmptyMetadata; use 
vortex::array::stream::ArrayStreamAdapter; use vortex::array::stream::ArrayStreamExt; +use vortex::array::validity::Validity; use vortex::dtype::DType; use vortex::dtype::arrow::FromArrowType; +use vortex::dtype::extension::ExtDType; use vortex::error::VortexResult; +use vortex::error::vortex_bail; use vortex::error::vortex_err; use vortex::file::WriteOptionsSessionExt; use vortex::session::VortexSession; +use vortex_tensor::vector::Vector; use crate::CompactionStrategy; use crate::Format; @@ -222,3 +233,230 @@ pub async fn write_parquet_as_vortex( }) .await } + +/// Rewrap a list-of-float column as a [`vortex_tensor::vector::Vector`] extension array. +/// +/// Parquet has no fixed-size list logical type, so an embedding column ingested via +/// [`parquet_to_vortex_chunks`] arrives as `List` (or `List` / `List`) even +/// when every row has the same length. This helper validates that every list in `input` +/// has the same length `D` and reconstructs the column as +/// `Extension(FixedSizeList)` — the shape expected by the vector-search +/// scalar functions in `vortex-tensor`. +/// +/// The input may be either a single [`List`] array or a [`Chunked`] array of lists (the +/// common case after `parquet_to_vortex_chunks`). Chunked inputs are converted chunk-by-chunk +/// and reassembled as a [`ChunkedArray`] of `Extension`. +/// +/// # Errors +/// +/// Returns an error if: +/// - `input` is not a `List` or `Chunked` array. +/// - The element type is not a non-nullable float primitive (`f16`, `f32`, or `f64`). +/// - Any row has a different length than the first row. +/// - The list validity is nullable (vector elements cannot be null at the row level). +/// - The input has zero rows (the dimension cannot be inferred from empty input). 
+pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { + if let Some(chunked) = input.as_opt::() { + let converted: Vec = chunked + .iter_chunks() + .map(|chunk| list_to_vector_ext(chunk.clone())) + .collect::>()?; + if converted.is_empty() { + vortex_bail!("list_to_vector_ext: chunked input has no chunks"); + } + return Ok(ChunkedArray::from_iter(converted).into_array()); + } + + let Some(list) = input.as_opt::() else { + vortex_bail!( + "list_to_vector_ext expects a List array, got dtype {}", + input.dtype() + ); + }; + + if !matches!( + list.list_validity(), + Validity::NonNullable | Validity::AllValid + ) { + vortex_bail!( + "list_to_vector_ext: list rows must be non-nullable for Vector extension wrapping" + ); + } + + let element_dtype = list.element_dtype().clone(); + let DType::Primitive(ptype, elem_nullability) = &element_dtype else { + vortex_bail!( + "list_to_vector_ext: element dtype must be a primitive float, got {}", + element_dtype + ); + }; + if !ptype.is_float() { + vortex_bail!( + "list_to_vector_ext: element type must be float (f16/f32/f64), got {}", + ptype + ); + } + if elem_nullability.is_nullable() { + vortex_bail!( + "list_to_vector_ext: element type must be non-nullable, got nullable {}", + ptype + ); + } + + let num_rows = input.len(); + if num_rows == 0 { + vortex_bail!("list_to_vector_ext: cannot infer vector dimension from empty input"); + } + + let first_start = list.offset_at(0)?; + let first_end = list.offset_at(1)?; + let dim = first_end.checked_sub(first_start).ok_or_else(|| { + vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") + })?; + if dim == 0 { + vortex_bail!("list_to_vector_ext: first row has zero elements"); + } + + for i in 1..num_rows { + let start = list.offset_at(i)?; + let end = list.offset_at(i + 1)?; + let row_len = end.checked_sub(start).ok_or_else(|| { + vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") + })?; + if row_len != dim { + vortex_bail!( + 
"list_to_vector_ext: row {} has length {} but expected {}", + i, + row_len, + dim + ); + } + } + + let elements = list.sliced_elements()?; + let expected_elements = num_rows + .checked_mul(dim) + .ok_or_else(|| vortex_err!("list_to_vector_ext: num_rows * dim overflows usize"))?; + if elements.len() != expected_elements { + vortex_bail!( + "list_to_vector_ext: elements buffer has length {} but expected {}", + elements.len(), + expected_elements + ); + } + + let dim_u32 = u32::try_from(dim) + .map_err(|_| vortex_err!("list_to_vector_ext: dimension {dim} does not fit in u32"))?; + + let fsl = FixedSizeListArray::try_new(elements, dim_u32, Validity::NonNullable, num_rows)?; + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) +} + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::Extension; + use vortex::array::arrays::List; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::validity::Validity; + use vortex::buffer::BufferMut; + use vortex::dtype::DType; + use vortex::dtype::Nullability; + use vortex::dtype::PType; + + use super::list_to_vector_ext; + + fn list_f32(rows: &[&[f32]]) -> vortex::array::ArrayRef { + let mut elements = BufferMut::::with_capacity(rows.iter().map(|r| r.len()).sum()); + let mut offsets = BufferMut::::with_capacity(rows.len() + 1); + offsets.push(0); + for row in rows { + for &v in row.iter() { + elements.push(v); + } + offsets.push(i32::try_from(elements.len()).unwrap()); + } + + let elements_array = + PrimitiveArray::new::(elements.freeze(), Validity::NonNullable).into_array(); + let offsets_array = + PrimitiveArray::new::(offsets.freeze(), Validity::NonNullable).into_array(); + vortex::array::Array::::new(elements_array, offsets_array, Validity::NonNullable) + .into_array() + } + + #[test] + fn uniform_list_becomes_vector_extension() 
{ + let list = list_f32(&[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0], &[7.0, 8.0, 9.0]]); + let wrapped = list_to_vector_ext(list).unwrap(); + assert_eq!(wrapped.len(), 3); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + + #[test] + fn mismatched_row_length_is_rejected() { + let list = list_f32(&[&[1.0, 2.0, 3.0], &[4.0, 5.0]]); + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("row 1 has length 2 but expected 3"), + "unexpected error: {err}", + ); + } + + #[test] + fn non_list_input_is_rejected() { + let primitive = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let err = list_to_vector_ext(primitive).unwrap_err().to_string(); + assert!( + err.contains("expects a List array"), + "unexpected error: {err}" + ); + } + + #[test] + fn empty_input_is_rejected() { + let list = list_f32(&[]); + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("cannot infer vector dimension from empty input"), + "unexpected error: {err}", + ); + } + + #[test] + fn non_float_element_type_is_rejected() { + // Build a List. + let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1i32, 2, 3, 4]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 2, 4]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) + .into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("element type must be float"), + "unexpected error: {err}", + ); + + // The unused Nullability / PType imports exist to make the intent clear; suppress + // the dead-code warning by referencing them in a no-op expression. 
+ let _ = (Nullability::NonNullable, PType::I32); + } +} diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index af0d3fdef30..b766e2e6a4c 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -138,6 +138,13 @@ pub enum Format { #[clap(name = "vortex-compact")] #[serde(rename = "vortex-compact")] VortexCompact, + /// Vortex file with the TurboQuant lossy vector-quantization encoding applied to + /// [`vortex_tensor::vector::Vector`] columns. Used by the vector-search benchmark to + /// distinguish TurboQuant-encoded results from the generic BtrBlocks-compressed + /// [`Format::OnDiskVortex`] layout in downstream dashboards. + #[clap(name = "vortex-turboquant")] + #[serde(rename = "vortex-turboquant")] + VortexTurboQuant, #[clap(name = "duckdb")] #[serde(rename = "duckdb")] OnDiskDuckDB, @@ -177,6 +184,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex-file-compressed", Format::VortexCompact => "vortex-compact", + Format::VortexTurboQuant => "vortex-turboquant", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } @@ -189,6 +197,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex", Format::VortexCompact => "vortex", + Format::VortexTurboQuant => "vortex", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index f49349cd95e..8629d38d598 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -348,10 +348,13 @@ impl ToJson for CompressionTimingMeasurement { fn to_json(&self) -> serde_json::Value { let (name, engine) = match self.format { Format::OnDiskVortex => (self.name.to_string(), Engine::Vortex), + Format::VortexTurboQuant => { + (format!("vortex-turboquant {}", self.name), Engine::Vortex) + } Format::Parquet => (format!("parquet_rs-zstd {}", self.name), Engine::Arrow), Format::Lance => (format!("lance {}", self.name), Engine::Arrow), _ => 
vortex_panic!( - "CompressionTimingMeasurement only supports vortex, lance, and parquet formats" + "CompressionTimingMeasurement only supports vortex, vortex-turboquant, lance, and parquet formats" ), }; @@ -392,7 +395,9 @@ pub struct CustomUnitMeasurement { impl ToJson for CustomUnitMeasurement { fn to_json(&self) -> serde_json::Value { let engine = match self.format { - Format::OnDiskVortex | Format::VortexCompact => Engine::Vortex, + Format::OnDiskVortex | Format::VortexCompact | Format::VortexTurboQuant => { + Engine::Vortex + } Format::Parquet => Engine::Arrow, Format::Lance => Engine::Arrow, _ => Engine::Vortex, // Default to Vortex for other formats. From 4a9f5e1b20da24c01dba48d9a1c59e8a4c582896 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:42:17 +0000 Subject: [PATCH 03/18] vortex-bench: add VectorDataset for vector-search benchmark datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce `VectorDataset`, a new `Dataset`-trait implementor that wires public VectorDBBench-hosted embedding corpora into the vortex-bench data-management machinery. The first variant is `CohereSmall` (Cohere wiki-22-12, 100K rows × 768 dims, cosine metric, ~150 MB zstd-parquet), targeting a CI-friendly size. Follow-up variants (CohereMedium, OpenAI*, SIFT*, etc.) can be added alongside without structural change. Each variant carries its upstream `assets.zilliz.com` parquet URL, dimension, row count, and curated distance metric. The download path reuses the existing `download_data` / `idempotent_async` infrastructure from `DownloadableDataset`, caching under `{name}/{name}.parquet` in the local benchmark data directory. 
Signed-off-by: Claude
Signed-off-by: Connor Tsui
---
 vortex-bench/src/lib.rs            |   1 +
 vortex-bench/src/vector_dataset.rs | 159 +++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 vortex-bench/src/vector_dataset.rs

diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs
index b766e2e6a4c..e2829246872 100644
--- a/vortex-bench/src/lib.rs
+++ b/vortex-bench/src/lib.rs
@@ -53,6 +53,7 @@ pub mod statpopgen;
 pub mod tpcds;
 pub mod tpch;
 pub mod utils;
+pub mod vector_dataset;
 
 pub use benchmark::Benchmark;
 pub use benchmark::TableSpec;
diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs
new file mode 100644
index 00000000000..d6a97d20ba9
--- /dev/null
+++ b/vortex-bench/src/vector_dataset.rs
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Datasets used by the vector-search benchmark.
+//!
+//! These are a subset of the public VectorDBBench
+//! (<https://github.com/zilliztech/VectorDBBench>) datasets — MIT-licensed canonical
+//! embedding corpora published by Zilliz under
+//! `https://assets.zilliz.com/benchmark/<dataset>/`. Each dataset is distributed as one or more
+//! parquet files with a `emb: list<float>` column (the raw embedding vectors) and an
+//! `id: int64` column.
+//!
+//! The URL constants below point at the upstream Zilliz bucket. For CI runs we recommend
+//! mirroring these files into an internal bucket first to avoid repeated egress charges on
+//! a third-party bucket — mirror setup is a one-off manual operation and documented in the
+//! vector-search-bench crate README.
+ +use std::path::PathBuf; + +use anyhow::Result; +use async_trait::async_trait; +use tokio::fs::File; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::stream::ArrayStreamExt; +use vortex::file::OpenOptionsSessionExt; +use vortex::file::WriteOptionsSessionExt; + +use crate::IdempotentPath; +use crate::SESSION; +use crate::conversions::parquet_to_vortex_chunks; +use crate::datasets::Dataset; +use crate::datasets::data_downloads::download_data; +use crate::idempotent_async; + +/// A public embedding-vector dataset used by the vector-search benchmark. +/// +/// Each variant is one of the canonical VectorDBBench corpora, distributed as parquet under +/// the Zilliz public benchmark bucket. The smaller `*Small` sizes are appropriate for CI +/// runs; the larger sizes are intended for local / on-demand experiments. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VectorDataset { + /// Cohere wiki-22-12, 100K rows × 768 dims, cosine metric. ~307 MB raw / ~150 MB + /// zstd-parquet — the default CI-friendly size. + CohereSmall, +} + +impl VectorDataset { + /// The upstream URL for this dataset's canonical train-split parquet file. + /// + /// **CI note**: point at an internal mirror before enabling this benchmark in CI. + pub fn parquet_url(&self) -> &'static str { + match self { + VectorDataset::CohereSmall => { + "https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet" + } + } + } + + /// Fixed vector dimensionality for this dataset. + pub fn dim(&self) -> u32 { + match self { + VectorDataset::CohereSmall => 768, + } + } + + /// Expected number of rows in the train split. + pub fn num_rows(&self) -> usize { + match self { + VectorDataset::CohereSmall => 100_000, + } + } + + /// The distance metric the upstream dataset was curated for. v1 only wires cosine, so + /// this is informational today. 
+ pub fn metric(&self) -> VectorMetric { + match self { + VectorDataset::CohereSmall => VectorMetric::Cosine, + } + } +} + +/// Distance metric a dataset was curated for. The vector-search benchmark only wires cosine +/// today, but having this explicit makes it obvious when a future dataset should be paired +/// with L2 or inner-product instead. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VectorMetric { + /// Cosine similarity: `dot(a, b) / (||a|| * ||b||)`. + Cosine, + /// Squared L2 distance: `sum((a - b)^2)`. + L2, + /// Inner product: `dot(a, b)`. + InnerProduct, +} + +#[async_trait] +impl Dataset for VectorDataset { + fn name(&self) -> &str { + match self { + VectorDataset::CohereSmall => "cohere-small", + } + } + + async fn to_parquet_path(&self) -> Result { + let dir = format!("{}/", self.name()).to_data_path(); + let parquet = dir.join(format!("{}.parquet", self.name())); + download_data(parquet.clone(), self.parquet_url()).await?; + Ok(parquet) + } + + async fn to_vortex_array(&self) -> Result { + let parquet = self.to_parquet_path().await?; + let dir = format!("{}/", self.name()).to_data_path(); + let vortex = dir.join(format!("{}.vortex", self.name())); + + let data = parquet_to_vortex_chunks(parquet).await?; + idempotent_async(&vortex, async |path| -> Result<()> { + SESSION + .write_options() + .write( + &mut File::create(path) + .await + .map_err(|e| anyhow::anyhow!("Failed to create file: {}", e))?, + data.into_array().to_array_stream(), + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to write vortex file: {}", e))?; + Ok(()) + }) + .await?; + + Ok(SESSION + .open_options() + .open_path(vortex.as_path()) + .await? + .scan()? + .into_array_stream()? + .read_all() + .await?) 
+ } +} + +#[cfg(test)] +mod tests { + use super::VectorDataset; + use super::VectorMetric; + use crate::datasets::Dataset; + + #[test] + fn cohere_small_metadata() { + let ds = VectorDataset::CohereSmall; + assert_eq!(ds.name(), "cohere-small"); + assert_eq!(ds.dim(), 768); + assert_eq!(ds.num_rows(), 100_000); + assert_eq!(ds.metric(), VectorMetric::Cosine); + assert!(ds.parquet_url().ends_with("/train.parquet")); + assert!(ds.parquet_url().contains("cohere_small_100k")); + } +} From c6e5dc007fcf0aa76e3ddddc0be03d13979be11f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 12:53:29 +0000 Subject: [PATCH 04/18] Add vector-search-bench crate with Vortex similarity-search variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new standalone benchmark crate at `benchmarks/vector-search-bench/` that measures brute-force cosine similarity search over VectorDBBench-style public embedding corpora. This commit wires up three Vortex variants: - `vortex-uncompressed`: raw `Vector` with no encoding-level compression applied. Baseline for Vortex variants. - `vortex-default`: `BtrBlocksCompressor::default()` applied to the FSL storage — generic lossless compression for float vectors. - `vortex-turboquant`: the full `L2Denorm(SorfTransform(FSL(Dict)))` pipeline via `compress_turboquant`. Lossy; in-memory `.nbytes()` is used as the reported size until `L2Denorm` serialization lands. Four measurements are emitted per `(dataset, variant)` pair: 1. Compressed size (bytes) — on-disk `.vortex` file size for the lossless variants, in-memory footprint for TurboQuant. 2. Full-scan decode time — executes the column to a materialized `FixedSizeListArray`, forcing the entire decompression/dequantization path for each variant. 3. Cosine-similarity execute time — runs `CosineSimilarity(data, query)` into a materialized f32 array. 4. Filter execute time — runs `Binary(Gt, [cosine, threshold])` into a `BoolArray`. 
This is the end-to-end "query pushdown" path. Output goes through the existing `vortex_bench::measurements` types so results flow through the standard `gh-json` pipeline and appear on the CI dashboard alongside compress-bench / random-access-bench results. `Variant::VortexUncompressed` and `Variant::VortexDefault` both report as `Format::OnDiskVortex`; `Variant::VortexTurboQuant` reports as `Format::VortexTurboQuant`, letting downstream consumers distinguish them via the existing `Target { engine, format }` grouping key. The crate ships with a unit test that builds a 64×128 synthetic `Vector` array, runs all three variants through `prepare_variant` + `run_timings`, and verifies every variant reports a non-zero size and non-zero timings. Parquet-Arrow baseline and Recall@10 quality measurements are deferred to a follow-up commit; this commit intentionally keeps the Vortex-only core focused. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 17 + Cargo.toml | 1 + benchmarks/vector-search-bench/Cargo.toml | 31 ++ benchmarks/vector-search-bench/src/lib.rs | 494 +++++++++++++++++++++ benchmarks/vector-search-bench/src/main.rs | 173 ++++++++ 5 files changed, 716 insertions(+) create mode 100644 benchmarks/vector-search-bench/Cargo.toml create mode 100644 benchmarks/vector-search-bench/src/lib.rs create mode 100644 benchmarks/vector-search-bench/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 424eebac8f6..f192874ce88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10052,6 +10052,23 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vector-search-bench" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "clap", + "indicatif", + "itertools 0.14.0", + "tokio", + "tracing", + "vortex", + "vortex-bench", + "vortex-btrblocks", + "vortex-tensor", +] + [[package]] name = "version_check" version = "0.9.5" diff 
--git a/Cargo.toml b/Cargo.toml index 9853cf94ed9..0c904489f58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,7 @@ members = [ "benchmarks/datafusion-bench", "benchmarks/duckdb-bench", "benchmarks/random-access-bench", + "benchmarks/vector-search-bench", ] exclude = ["java/testfiles", "wasm-test"] resolver = "2" diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml new file mode 100644 index 00000000000..f881a5542b1 --- /dev/null +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "vector-search-bench" +description = "Vector similarity search benchmarks for Vortex on public embedding datasets" +authors.workspace = true +categories.workspace = true +edition.workspace = true +homepage.workspace = true +include.workspace = true +keywords.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +async-trait = { workspace = true } +clap = { workspace = true, features = ["derive"] } +indicatif = { workspace = true } +itertools = { workspace = true } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +vortex = { workspace = true } +vortex-bench = { workspace = true } +vortex-btrblocks = { workspace = true } +vortex-tensor = { workspace = true } + +[lints] +workspace = true diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs new file mode 100644 index 00000000000..18356c8ee86 --- /dev/null +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vector similarity-search benchmark core. +//! +//! This crate measures four quantities for each `(dataset, variant)` pair: +//! +//! 1. 
**Compressed storage size** (bytes on disk, or in-memory `.nbytes()` for variants that +//! don't yet serialize — currently just [`Variant::VortexTurboQuant`]). +//! 2. **Full-scan decode time** — executing the `Vector` column into a +//! materialized [`vortex::array::arrays::FixedSizeListArray`]. +//! 3. **Cosine-similarity execute time** — executing +//! `CosineSimilarity(data, const_query)` into a materialized f32 primitive array. +//! 4. **Filter execute time** — executing +//! `Binary(Gt, [CosineSimilarity, threshold])` into a +//! [`vortex::array::arrays::BoolArray`]. +//! +//! Measurements are emitted via the existing `vortex_bench::measurements` types so that +//! the benchmark results flow through the standard `gh-json` pipeline and appear in the +//! CI dashboard alongside compress-bench / random-access-bench results. + +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use clap::ValueEnum; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::VortexSessionExecute; +use vortex::array::arrays::BoolArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::error::VortexExpect; +use vortex::session::VortexSession; +use vortex_bench::Format; +use vortex_bench::SESSION; +use vortex_bench::conversions::list_to_vector_ext; +use vortex_bench::conversions::parquet_to_vortex_chunks; +use vortex_bench::datasets::Dataset; +use vortex_bench::vector_dataset::VectorDataset; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_tensor::vector_search::build_similarity_search_tree; +use vortex_tensor::vector_search::compress_turboquant; + +/// The threshold used when wrapping the similarity expression in a +/// `Binary(Gt, [cosine, threshold])` filter. Set to a value high enough that random pairs +/// from a ~1.0-norm distribution reject but self-query pairs match. 
+pub const DEFAULT_THRESHOLD: f32 = 0.8; + +/// Row index used to pick a query vector from the dataset. Using a fixed row keeps queries +/// reproducible across runs and guarantees at least one match (since `cosine(x, x) == 1.0`). +pub const DEFAULT_QUERY_ROW: usize = 0; + +/// A single data-preparation strategy that the benchmark exercises. +/// +/// Each variant corresponds to one column on the "format" axis in downstream dashboards. The +/// `Format` mapping is what gets serialized into the `target.format` field of gh-json +/// output. +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +pub enum Variant { + /// Raw `Vector` with no encoding-level compression applied. + #[clap(name = "vortex-uncompressed")] + VortexUncompressed, + /// `BtrBlocksCompressor::default()` walks into the `Vector` extension and recursively + /// compresses the FSL storage child. This is the "generic lossless" Vortex story for + /// float vectors. + #[clap(name = "vortex-default")] + VortexDefault, + /// The full TurboQuant pipeline: `L2Denorm(SorfTransform(FSL(Dict)))`. Lossy; dramatic + /// size win; requires reporting recall alongside throughput for the comparison to be + /// honest. See [`vortex_tensor::vector_search::compress_turboquant`]. + #[clap(name = "vortex-turboquant")] + VortexTurboQuant, +} + +impl Variant { + /// The Format enum value this variant reports itself as in emitted measurements. + /// Uncompressed and BtrBlocks-default both surface as [`Format::OnDiskVortex`]; the + /// TurboQuant variant gets its own [`Format::VortexTurboQuant`] so dashboards can + /// distinguish them. + pub fn as_format(&self) -> Format { + match self { + Variant::VortexUncompressed => Format::OnDiskVortex, + Variant::VortexDefault => Format::OnDiskVortex, + Variant::VortexTurboQuant => Format::VortexTurboQuant, + } + } + + /// A stable, kebab-cased label used in metric names so dashboards can split apart + /// variants that map to the same Format. 
+    pub fn label(&self) -> &'static str {
+        match self {
+            Variant::VortexUncompressed => "vortex-uncompressed",
+            Variant::VortexDefault => "vortex-default",
+            Variant::VortexTurboQuant => "vortex-turboquant",
+        }
+    }
+}
+
+/// Number of rows in the query vector — matches the database so `ScalarFnArray`'s
+/// equal-length contract is satisfied. This type alias exists to make the broadcast
+/// semantics obvious at call sites.
+type QueryLen = usize;
+
+/// A materialized Vortex array and its associated execution session / context.
+pub struct PreparedDataset {
+    /// Name used in metric strings — usually the dataset's `Dataset::name()`.
+    pub name: String,
+    /// Uncompressed `Vector` array (canonical form). This is reused as the
+    /// ground-truth basis for TurboQuant recall checks in future commits.
+    pub uncompressed: ArrayRef,
+    /// The query vector to use (a single row pulled from the dataset).
+    pub query: Vec<f32>,
+    /// Parquet file size on disk in bytes — produced by the dataset download step.
+    pub parquet_bytes: u64,
+}
+
+impl PreparedDataset {
+    /// Dimension of the underlying vector column.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self.uncompressed` is not an `Extension<FixedSizeList<f32>>` —
+    /// which should be impossible because [`prepare_dataset`] is the only constructor
+    /// and it guarantees this shape.
+    pub fn dim(&self) -> u32 {
+        let fsl_dtype = match self.uncompressed.dtype() {
+            vortex::dtype::DType::Extension(ext) => ext.storage_dtype(),
+            other => {
+                vortex::error::vortex_panic!("expected Extension, got {other}")
+            }
+        };
+        match fsl_dtype {
+            vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim,
+            other => {
+                vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}")
+            }
+        }
+    }
+
+    /// Number of rows in the uncompressed dataset.
+    pub fn num_rows(&self) -> usize {
+        self.uncompressed.len()
+    }
+}
+
+/// Prepare a dataset by downloading its parquet file, converting the `emb` column to a
+/// `Vector` extension array, and extracting a single-row query vector.
+pub async fn prepare_dataset(dataset: &VectorDataset) -> Result<PreparedDataset> {
+    let parquet_path = dataset
+        .to_parquet_path()
+        .await
+        .context("download vector dataset parquet")?;
+    let parquet_bytes = std::fs::metadata(&parquet_path)
+        .with_context(|| format!("stat parquet file {:?}", parquet_path))?
+        .len();
+
+    tracing::info!(
+        "ingesting {} parquet from {:?} ({} bytes)",
+        dataset.name(),
+        parquet_path,
+        parquet_bytes
+    );
+
+    let chunked = parquet_to_vortex_chunks(parquet_path).await?;
+
+    let struct_array = chunked.into_array();
+    let emb_column = extract_emb_column(&struct_array)?;
+    let uncompressed = list_to_vector_ext(emb_column)?;
+
+    let query = extract_query_row(&uncompressed, DEFAULT_QUERY_ROW)?;
+
+    Ok(PreparedDataset {
+        name: dataset.name().to_string(),
+        uncompressed,
+        query,
+        parquet_bytes,
+    })
+}
+
+/// Project the `emb` column out of a chunked struct array. This rebuilds a chunked list
+/// array with just that one column.
+fn extract_emb_column(struct_array: &ArrayRef) -> Result<ArrayRef> {
+    use vortex::array::arrays::Chunked;
+    use vortex::array::arrays::ChunkedArray;
+    use vortex::array::arrays::Struct;
+    use vortex::array::arrays::chunked::ChunkedArrayExt;
+    use vortex::array::arrays::struct_::StructArrayExt as _;
+
+    if let Some(chunked) = struct_array.as_opt::<Chunked>() {
+        let mut emb_chunks: Vec<ArrayRef> = Vec::with_capacity(chunked.nchunks());
+        for chunk in chunked.iter_chunks() {
+            emb_chunks.push(extract_emb_column(chunk)?);
+        }
+        if emb_chunks.is_empty() {
+            bail!("dataset has no chunks");
+        }
+        return Ok(ChunkedArray::from_iter(emb_chunks).into_array());
+    }
+
+    let Some(struct_view) = struct_array.as_opt::<Struct>() else {
+        bail!(
+            "expected dataset chunks to be Struct arrays, got {}",
+            struct_array.dtype()
+        );
+    };
+
+    let field = struct_view
+        .unmasked_field_by_name("emb")
+        .context("dataset parquet must have an `emb` column")?;
+    Ok(field.clone())
+}
+
+/// Pull a single row out of a `Vector` extension array as a plain `Vec<f32>`.
+fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result<Vec<f32>> {
+    use vortex::array::arrays::Extension;
+    use vortex::array::arrays::extension::ExtensionArrayExt;
+    use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt;
+
+    let mut ctx = SESSION.create_execution_ctx();
+
+    let ext_view = vector_ext
+        .as_opt::<Extension>()
+        .context("prepared dataset must be a Vector extension array")?;
+
+    // Execute storage array to its canonical FSL form.
+ let fsl: FixedSizeListArray = ext_view.storage_array().clone().execute(&mut ctx)?; + + let dim_usize = { + let vortex::dtype::DType::FixedSizeList(_, d, _) = fsl.dtype() else { + bail!("storage dtype must be FixedSizeList"); + }; + *d as usize + }; + + if row * dim_usize + dim_usize > vector_ext.len() * dim_usize { + bail!( + "query row {row} out of bounds for dataset of length {}", + vector_ext.len() + ); + } + + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; + let slice = elements.as_slice::(); + let start = row * dim_usize; + Ok(slice[start..start + dim_usize].to_vec()) +} + +/// Apply a `Variant`'s preparation strategy to the uncompressed Vortex array and return the +/// prepared array together with its reported size in bytes. For serializable variants the +/// size is the number of bytes written to a `.vortex` file; for in-memory-only variants +/// (TurboQuant) it's the live `.nbytes()` footprint. +pub async fn prepare_variant( + prepared: &PreparedDataset, + variant: Variant, + session: &VortexSession, +) -> Result<(ArrayRef, u64)> { + match variant { + Variant::VortexUncompressed => { + let array = prepared.uncompressed.clone(); + let size = + measure_on_disk_size(&array, session, &prepared.name, "uncompressed").await?; + Ok((array, size)) + } + Variant::VortexDefault => { + let array = BtrBlocksCompressor::default().compress(&prepared.uncompressed)?; + let size = measure_on_disk_size(&array, session, &prepared.name, "default").await?; + Ok((array, size)) + } + Variant::VortexTurboQuant => { + let mut ctx = session.create_execution_ctx(); + let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; + // TurboQuant cannot yet round-trip through a Vortex file (L2Denorm metadata + // serialization is not implemented). Report the in-memory `.nbytes()` footprint + // as a proxy. Document this in the benchmark output so consumers of the + // dashboard aren't misled. 
+ let size = array.nbytes() as u64; + Ok((array, size)) + } + } +} + +/// Serialize a prepared Vortex array to a temporary `.vortex` file and return its length. +/// This is what we report as the "compressed size" for serializable variants; it matches +/// the semantics of `compress-bench` which reports the on-disk parquet/vortex file size. +async fn measure_on_disk_size( + array: &ArrayRef, + session: &VortexSession, + dataset_name: &str, + variant_label: &str, +) -> Result { + use vortex::file::WriteOptionsSessionExt; + + let tmp_dir = std::env::temp_dir().join("vortex-vector-search-bench"); + tokio::fs::create_dir_all(&tmp_dir).await?; + let tmp_path = tmp_dir.join(format!("{dataset_name}-{variant_label}.vortex")); + + let mut file = tokio::fs::File::create(&tmp_path).await?; + session + .write_options() + .write(&mut file, array.clone().to_array_stream()) + .await?; + + let metadata = tokio::fs::metadata(&tmp_path).await?; + Ok(metadata.len()) +} + +/// Run the decode / cosine / filter microbenchmarks against a prepared variant array and +/// return the best-of-`iterations` wall times for each measurement. 
+pub fn run_timings( + variant_array: &ArrayRef, + query: &[f32], + iterations: usize, + session: &VortexSession, +) -> Result { + let _ = QueryLen::default; // touch the type alias so rustc doesn't warn + + let mut decode = Duration::MAX; + let mut cosine = Duration::MAX; + let mut filter = Duration::MAX; + + for _ in 0..iterations { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let decoded: FixedSizeListArray = decode_full_scan(variant_array, &mut ctx)?; + decode = decode.min(start.elapsed()); + drop(decoded); + } + + for _ in 0..iterations { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; + cosine = cosine.min(start.elapsed()); + drop(scores); + } + + for _ in 0..iterations { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let matches: BoolArray = execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; + filter = filter.min(start.elapsed()); + drop(matches); + } + + Ok(VariantTimings { + decode, + cosine, + filter, + }) +} + +/// Timing summary for one `(dataset, variant)` pair. +#[derive(Debug, Clone, Copy)] +pub struct VariantTimings { + /// Wall time for a full column decode. + pub decode: Duration, + /// Wall time for the cosine_similarity scalar-function execution. + pub cosine: Duration, + /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression. + pub filter: Duration, +} + +/// Fully materialize the input column so the measurement captures *all* decompression +/// work — the extension shell, the FSL storage, and the inner element buffer. +/// +/// For the Vortex-uncompressed variant this is cheap (bitwise copy / no-op). For +/// BtrBlocks-default it includes FSL decompression. For TurboQuant it includes running +/// the inverse SORF rotation + dictionary lookup through the scalar-fn pipeline. 
+fn decode_full_scan( + array: &ArrayRef, + ctx: &mut vortex::array::ExecutionCtx, +) -> Result { + use vortex::array::arrays::ExtensionArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + + let ext: ExtensionArray = array.clone().execute(ctx)?; + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; + Ok(fsl) +} + +fn execute_cosine( + data: &ArrayRef, + query: &[f32], + ctx: &mut vortex::array::ExecutionCtx, +) -> Result { + use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; + use vortex_tensor::vector_search::build_constant_query_vector; + + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) + .vortex_expect("cosine similarity accepts matching Vector inputs") + .into_array(); + Ok(cosine.execute(ctx)?) +} + +fn execute_filter( + data: &ArrayRef, + query: &[f32], + threshold: f32, + ctx: &mut vortex::array::ExecutionCtx, +) -> Result { + let tree = build_similarity_search_tree(data.clone(), query, threshold)?; + Ok(tree.execute(ctx)?) +} + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::ExtensionArray; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + use vortex::array::extension::EmptyMetadata; + use vortex::array::validity::Validity; + use vortex::buffer::BufferMut; + use vortex::dtype::extension::ExtDType; + use vortex_bench::SESSION; + use vortex_tensor::vector::Vector; + + use super::*; + + fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { + let mut buf = BufferMut::::with_capacity(num_rows * dim as usize); + let mut state = seed; + for _ in 0..(num_rows * dim as usize) { + // Simple xorshift — deterministic, distribution not important for this test. 
+ state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + let v = ((state & 0xFFFF) as f32 / 32768.0) - 1.0; + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable).into_array(); + let fsl = + FixedSizeListArray::try_new(elements, dim, Validity::NonNullable, num_rows).unwrap(); + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone()) + .unwrap() + .erased(); + ExtensionArray::new(ext_dtype, fsl.into_array()).into_array() + } + + #[test] + fn prepare_variant_produces_non_empty_array_for_all_variants() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let ext = uncompressed + .as_opt::() + .unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); + let slice = elements.as_slice::(); + let query = slice[..dim as usize].to_vec(); + + let prepared = PreparedDataset { + name: "synthetic".to_string(), + uncompressed: uncompressed.clone(), + query, + parquet_bytes: 0, + }; + + for variant in [ + Variant::VortexUncompressed, + Variant::VortexDefault, + Variant::VortexTurboQuant, + ] { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let (array, size) = rt + .block_on(prepare_variant(&prepared, variant, &SESSION)) + .unwrap(); + assert_eq!( + array.len(), + num_rows, + "variant {variant:?} changed row count" + ); + assert!(size > 0, "variant {variant:?} reported zero size"); + + let timings = run_timings(&array, &prepared.query, 2, &SESSION).unwrap(); + assert!(timings.decode > Duration::ZERO); + assert!(timings.cosine > Duration::ZERO); + assert!(timings.filter > Duration::ZERO); + } + } +} diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs new file mode 100644 index 
00000000000..74f4c740f6b --- /dev/null +++ b/benchmarks/vector-search-bench/src/main.rs @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `vector-search-bench` — brute-force cosine-similarity benchmark over public VectorDBBench +//! embedding corpora. +//! +//! Usage: +//! +//! ```bash +//! cargo run -p vector-search-bench --release -- \ +//! --datasets cohere-small \ +//! --variants vortex-uncompressed,vortex-default,vortex-turboquant \ +//! --iterations 5 \ +//! -d table +//! ``` + +use std::borrow::Cow; +use std::path::PathBuf; + +use anyhow::Result; +use clap::Parser; +use indicatif::ProgressBar; +use vector_search_bench::Variant; +use vector_search_bench::prepare_dataset; +use vector_search_bench::prepare_variant; +use vector_search_bench::run_timings; +use vortex_bench::SESSION; +use vortex_bench::create_output_writer; +use vortex_bench::display::DisplayFormat; +use vortex_bench::display::print_measurements_json; +use vortex_bench::measurements::CompressionTimingMeasurement; +use vortex_bench::measurements::CustomUnitMeasurement; +use vortex_bench::setup_logging_and_tracing; +use vortex_bench::vector_dataset::VectorDataset; + +const BENCHMARK_ID: &str = "vector-search"; + +/// Command-line arguments for `vector-search-bench`. +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Number of timed iterations per measurement. The reported time is the minimum across + /// iterations (matches compress-bench convention). + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + /// Subset of datasets to run. Defaults to Cohere-small. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableDataset::CohereSmall])] + datasets: Vec, + + /// Subset of variants to exercise. Defaults to all three Vortex variants. 
+ #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Variant::VortexUncompressed, Variant::VortexDefault, Variant::VortexTurboQuant])] + variants: Vec, + + /// Output display format (`table` for humans, `gh-json` for CI ingestion). + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + /// If set, write output to this file instead of stdout. + #[arg(short, long)] + output_path: Option, + + /// Verbose logging. + #[arg(short, long)] + verbose: bool, + + /// Enable perfetto tracing output. + #[arg(long)] + tracing: bool, +} + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] +enum SelectableDataset { + #[clap(name = "cohere-small")] + CohereSmall, +} + +impl SelectableDataset { + fn into_dataset(self) -> VectorDataset { + match self { + SelectableDataset::CohereSmall => VectorDataset::CohereSmall, + } + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let datasets: Vec = args + .datasets + .iter() + .copied() + .map(SelectableDataset::into_dataset) + .collect(); + + let total_work = datasets.len() * args.variants.len(); + let progress = ProgressBar::new(total_work as u64); + + let mut timings: Vec = Vec::new(); + let mut sizes: Vec = Vec::new(); + + for dataset in &datasets { + let prepared = prepare_dataset(dataset).await?; + tracing::info!( + "prepared {}: dim={}, num_rows={}", + prepared.name, + prepared.dim(), + prepared.num_rows() + ); + + for &variant in &args.variants { + let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; + + let variant_label = variant.label(); + let bench_name = format!("{variant_label}/{}", prepared.name); + + sizes.push(CustomUnitMeasurement { + name: format!("{variant_label} size/{}", prepared.name), + format: variant.as_format(), + unit: Cow::from("bytes"), + value: size_bytes as f64, + }); + + let variant_timings = + 
run_timings(&variant_array, &prepared.query, args.iterations, &SESSION)?; + + timings.push(CompressionTimingMeasurement { + name: format!("decode time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.decode, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.filter, + }); + + progress.inc(1); + } + } + progress.finish(); + + let mut writer = create_output_writer(&args.display_format, args.output_path, BENCHMARK_ID)?; + match args.display_format { + DisplayFormat::Table => { + // Our variants span multiple `Format` values *and* multiple labels that share a + // single `Format`, so the existing `render_table` helper (which groups by + // `Target`) would collapse them. Emit one line per measurement instead; this is + // only used for developer inspection — CI consumes `gh-json` via the arm below. + for timing in &timings { + writeln!(writer, "{} {} ns", timing.name, timing.time.as_nanos())?; + } + for size in &sizes { + writeln!(writer, "{} {} {}", size.name, size.value, size.unit)?; + } + } + DisplayFormat::GhJson => { + print_measurements_json(&mut writer, timings)?; + print_measurements_json(&mut writer, sizes)?; + } + } + + Ok(()) +} + +use std::io::Write; From bc3861f28ee849e640ef5ccd2b51fcaff6cd69d3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:00:42 +0000 Subject: [PATCH 05/18] vector-search-bench: add parquet-Arrow baseline and Recall@10 measurement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This completes the v1 scope of the vector-search benchmark by adding two orthogonal pieces of functionality: 1. 
**Parquet-Arrow hand-rolled cosine baseline** (`parquet_baseline.rs`) reads the canonical parquet file via `parquet::arrow`, decodes the `emb` column to a flat `Vec<f32>`, and runs a straightforward Rust cosine loop. This is the "what you'd do without Vortex" external floor — necessary for the benchmark to tell a credible story about Vortex's value vs. raw parquet. Reports size, decode time, cosine time, and filter time in the same format as the Vortex variants, under `Format::Parquet`. Handles both `List` and `FixedSizeList` parquet schemas. 2. **Recall@K quality measurement** (`recall.rs`) computes the fraction of the exact top-K nearest neighbours that the lossy TurboQuant variant recovers, using the uncompressed Vortex scan as local ground truth (not VectorDBBench's shipped `neighbors.parquet`, which would require an index and is out of scope). Samples are deterministic — rows picked at uniform intervals across the dataset — so results are stable across runs. Only TurboQuant is checked; lossless variants are trivially 1.0. Emitted as a `CustomUnitMeasurement` with unit "recall" so it flows through the existing gh-json pipeline alongside the size/timing measurements. The binary picks this all up behind three new CLI flags: `--parquet-baseline` (default true), `--recall-queries N` (default 100), and `--recall-k K` (default 10). Three unit tests cover the new functionality: - `parquet_baseline_reads_fsl_column` — end-to-end parquet write + read + cosine loop on a 3×3 fixture. - `uncompressed_has_perfect_self_recall` — sanity check that recall@10 of a variant against itself is exactly 1.0. - `turboquant_recall_is_reasonable_for_synthetic_data` — loose lower bound (>= 0.3) on TurboQuant recall for a 64×128 random dataset to catch total regressions without being flaky. A small `test_utils` submodule was extracted from the lib's existing tests module so the recall tests can reuse the synthetic vector generator.
Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 4 + benchmarks/vector-search-bench/Cargo.toml | 6 + benchmarks/vector-search-bench/src/lib.rs | 30 +- benchmarks/vector-search-bench/src/main.rs | 88 +++++ .../src/parquet_baseline.rs | 303 ++++++++++++++++++ benchmarks/vector-search-bench/src/recall.rs | 188 +++++++++++ 6 files changed, 611 insertions(+), 8 deletions(-) create mode 100644 benchmarks/vector-search-bench/src/parquet_baseline.rs create mode 100644 benchmarks/vector-search-bench/src/recall.rs diff --git a/Cargo.lock b/Cargo.lock index f192874ce88..c56f8e5330e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10057,10 +10057,14 @@ name = "vector-search-bench" version = "0.1.0" dependencies = [ "anyhow", + "arrow-array 58.0.0", + "arrow-schema 58.0.0", "async-trait", "clap", "indicatif", "itertools 0.14.0", + "parquet 58.0.0", + "tempfile", "tokio", "tracing", "vortex", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index f881a5542b1..068cf400d71 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -16,10 +16,13 @@ publish = false [dependencies] anyhow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } async-trait = { workspace = true } clap = { workspace = true, features = ["derive"] } indicatif = { workspace = true } itertools = { workspace = true } +parquet = { workspace = true } tokio = { workspace = true, features = ["full"] } tracing = { workspace = true } vortex = { workspace = true } @@ -27,5 +30,8 @@ vortex-bench = { workspace = true } vortex-btrblocks = { workspace = true } vortex-tensor = { workspace = true } +[dev-dependencies] +tempfile = { workspace = true } + [lints] workspace = true diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 18356c8ee86..27f61f39ed7 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ 
b/benchmarks/vector-search-bench/src/lib.rs @@ -22,6 +22,9 @@ use std::time::Duration; use std::time::Instant; +pub mod parquet_baseline; +pub mod recall; + use anyhow::Context; use anyhow::Result; use anyhow::bail; @@ -407,28 +410,27 @@ fn execute_filter( Ok(tree.execute(ctx)?) } +/// Test-only helpers shared between the unit tests in this crate's submodules. #[cfg(test)] -mod tests { +pub(crate) mod test_utils { + use vortex::array::ArrayRef; use vortex::array::IntoArray; use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::PrimitiveArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex::array::extension::EmptyMetadata; use vortex::array::validity::Validity; use vortex::buffer::BufferMut; use vortex::dtype::extension::ExtDType; - use vortex_bench::SESSION; use vortex_tensor::vector::Vector; - use super::*; - - fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { + /// Build a deterministic `Vector` extension array of `num_rows` rows for + /// tests. The PRNG is a trivial xorshift keyed by `seed`; we don't care about the + /// distribution beyond "not all zeros". + pub fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { let mut buf = BufferMut::::with_capacity(num_rows * dim as usize); let mut state = seed; for _ in 0..(num_rows * dim as usize) { - // Simple xorshift — deterministic, distribution not important for this test. 
state ^= state << 13; state ^= state >> 7; state ^= state << 17; @@ -443,6 +445,18 @@ mod tests { .erased(); ExtensionArray::new(ext_dtype, fsl.into_array()).into_array() } +} + +#[cfg(test)] +mod tests { + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + use vortex_bench::SESSION; + + use super::test_utils::synthetic_vector; + use super::*; #[test] fn prepare_variant_produces_non_empty_array_for_all_variants() { diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 74f4c740f6b..5a5ef57a742 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -20,12 +20,18 @@ use std::path::PathBuf; use anyhow::Result; use clap::Parser; use indicatif::ProgressBar; +use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; +use vector_search_bench::parquet_baseline::run_parquet_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; +use vector_search_bench::recall::DEFAULT_TOP_K; +use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; +use vortex_bench::Format; use vortex_bench::SESSION; use vortex_bench::create_output_writer; +use vortex_bench::datasets::Dataset; use vortex_bench::display::DisplayFormat; use vortex_bench::display::print_measurements_json; use vortex_bench::measurements::CompressionTimingMeasurement; @@ -52,6 +58,20 @@ struct Args { #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Variant::VortexUncompressed, Variant::VortexDefault, Variant::VortexTurboQuant])] variants: Vec, + /// Also run the Parquet-Arrow hand-rolled cosine baseline as an additional variant. + /// Default `true` — disable only when you intentionally want a Vortex-only comparison. 
+ #[arg(long, default_value_t = true)] + parquet_baseline: bool, + + /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables + /// the quality measurement entirely (useful for smoke tests). + #[arg(long, default_value_t = 100)] + recall_queries: usize, + + /// K in Recall@K. Defaults to 10, matching VectorDBBench conventions. + #[arg(long, default_value_t = DEFAULT_TOP_K)] + recall_k: usize, + /// Output display format (`table` for humans, `gh-json` for CI ingestion). #[arg(short, long, default_value_t, value_enum)] display_format: DisplayFormat, @@ -101,6 +121,8 @@ async fn main() -> Result<()> { let mut timings: Vec = Vec::new(); let mut sizes: Vec = Vec::new(); + let mut recalls: Vec = Vec::new(); + for dataset in &datasets { let prepared = prepare_dataset(dataset).await?; tracing::info!( @@ -110,6 +132,44 @@ async fn main() -> Result<()> { prepared.num_rows() ); + // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label + // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex + // variants. 
+ if args.parquet_baseline { + let parquet_path = dataset.to_parquet_path().await?; + let baseline_timings = run_parquet_baseline_timings( + &parquet_path, + &prepared.query, + DEFAULT_THRESHOLD, + args.iterations, + )?; + + let label = "parquet"; + let bench_name = format!("{label}/{}", prepared.name); + + sizes.push(CustomUnitMeasurement { + name: format!("{label} size/{}", prepared.name), + format: Format::Parquet, + unit: Cow::from("bytes"), + value: prepared.parquet_bytes as f64, + }); + timings.push(CompressionTimingMeasurement { + name: format!("decode time/{bench_name}"), + format: Format::Parquet, + time: baseline_timings.decode, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: Format::Parquet, + time: baseline_timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: Format::Parquet, + time: baseline_timings.filter, + }); + } + for &variant in &args.variants { let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; @@ -142,6 +202,26 @@ async fn main() -> Result<()> { time: variant_timings.filter, }); + // Recall@K quality measurement for lossy variants only. The lossless + // variants (uncompressed + BtrBlocks default) are trivially 1.0 against + // the uncompressed ground truth, so we skip them to avoid noise. 
+ if args.recall_queries > 0 && variant == Variant::VortexTurboQuant { + let recall = measure_recall_at_k( + &prepared.uncompressed, + &variant_array, + args.recall_queries, + args.recall_k, + &SESSION, + )?; + tracing::info!("Recall@{} for {}: {:.4}", args.recall_k, bench_name, recall); + recalls.push(CustomUnitMeasurement { + name: format!("recall@{}/{bench_name}", args.recall_k), + format: variant.as_format(), + unit: Cow::from("recall"), + value: recall, + }); + } + progress.inc(1); } } @@ -160,10 +240,18 @@ async fn main() -> Result<()> { for size in &sizes { writeln!(writer, "{} {} {}", size.name, size.value, size.unit)?; } + for recall in &recalls { + writeln!( + writer, + "{} {:.4} {}", + recall.name, recall.value, recall.unit + )?; + } } DisplayFormat::GhJson => { print_measurements_json(&mut writer, timings)?; print_measurements_json(&mut writer, sizes)?; + print_measurements_json(&mut writer, recalls)?; } } diff --git a/benchmarks/vector-search-bench/src/parquet_baseline.rs b/benchmarks/vector-search-bench/src/parquet_baseline.rs new file mode 100644 index 00000000000..b80a2308370 --- /dev/null +++ b/benchmarks/vector-search-bench/src/parquet_baseline.rs @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Parquet-Arrow hand-rolled cosine similarity baseline. +//! +//! This module provides the "what you'd do without Vortex" floor for the vector-search +//! benchmark. It reads the canonical parquet file for a dataset via `parquet::arrow`, +//! decodes the `emb` column to an Arrow `FixedSizeListArray`, and then runs a +//! straightforward Rust cosine-similarity loop — no scalar functions, no lazy expressions, +//! no index. +//! +//! The four measurements produced mirror those of the Vortex variants so dashboards can +//! put the parquet bar right next to the vortex bars: +//! +//! 1. Compressed size — the on-disk parquet file in bytes. +//! 2. 
Full-scan decode time — parquet → arrow record batches → concatenated +//! `FixedSizeListArray`. +//! 3. Cosine-similarity execute time — hand-rolled loop producing a `Vec` of scores. +//! 4. Filter execute time — the same loop materializing into a `Vec` where +//! `score > threshold`. +//! +//! This module does *not* include the parquet decode time in the cosine/filter wall +//! times. Decoding is treated as its own measurement. This matches how the Vortex variants +//! separate decode from compute. + +use std::fs::File; +use std::path::Path; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use arrow_array::Array; +use arrow_array::FixedSizeListArray; +use arrow_array::Float32Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::cast::AsArray; +use arrow_schema::DataType; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + +use crate::VariantTimings; + +/// Read the entire `emb` column of a parquet file into a single flat `Vec`, along +/// with the dimension and row count. +pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { + let file = File::open(parquet_path) + .with_context(|| format!("open parquet file {}", parquet_path.display()))?; + let file_size = file.metadata()?.len(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + + // Locate the `emb` column and sanity-check its type. + let (emb_idx, emb_field) = builder + .schema() + .column_with_name("emb") + .context("parquet schema missing `emb` column")?; + + // VectorDBBench parquet files use `list`; some others use `fixed_size_list`. + // Both need to be supported — the canonical parquet emit from arrow-rs is `list` + // since parquet has no fixed-size-list logical type. 
+ let element_dtype = match emb_field.data_type() { + DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) => { + field.data_type().clone() + } + other => bail!("emb column must be a list of float, got {other:?}"), + }; + if !matches!(element_dtype, DataType::Float32) { + bail!( + "emb column element type must be Float32, got {:?}", + element_dtype + ); + } + let _ = emb_idx; + + let reader = builder.build()?; + let batches: Vec = reader.collect::, _>>()?; + + let mut data = Vec::::new(); + let mut num_rows = 0usize; + let mut inferred_dim: Option = None; + + for batch in batches.iter() { + let column = batch + .column_by_name("emb") + .context("emb column missing from record batch")?; + append_batch(column, &mut data, &mut inferred_dim, &mut num_rows)?; + } + + let dim = inferred_dim.context("parquet file has zero rows — cannot infer dimension")?; + Ok(ParquetBaselineData { + elements: data, + dim, + num_rows, + file_size, + }) +} + +fn append_batch( + column: &dyn Array, + data: &mut Vec, + inferred_dim: &mut Option, + num_rows: &mut usize, +) -> Result<()> { + if let Some(fsl) = column.as_any().downcast_ref::() { + let dim = fsl.value_length() as usize; + maybe_set_dim(inferred_dim, dim)?; + let values = fsl + .values() + .as_any() + .downcast_ref::() + .context("FSL emb column must have Float32 values")?; + data.extend_from_slice(values.values()); + *num_rows += fsl.len(); + return Ok(()); + } + + if let Some(list) = column.as_any().downcast_ref::() { + let values: &Float32Array = list + .values() + .as_primitive_opt::() + .context("List emb column must have Float32 values")?; + let offsets = list.value_offsets(); + for i in 0..list.len() { + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + let row_len = end - start; + maybe_set_dim(inferred_dim, row_len)?; + data.extend_from_slice(&values.values()[start..end]); + *num_rows += 1; + } + return Ok(()); + } + + bail!( + "emb column has unsupported arrow type 
{:?}", + column.data_type() + ); +} + +fn maybe_set_dim(inferred_dim: &mut Option, new_dim: usize) -> Result<()> { + match inferred_dim { + Some(d) if *d == new_dim => Ok(()), + Some(d) => bail!("inconsistent emb dimensions: saw {d} then {new_dim}"), + None if new_dim == 0 => bail!("emb row has zero elements"), + None => { + *inferred_dim = Some(new_dim); + Ok(()) + } + } +} + +/// The flattened representation of a parquet file's embedding column, suitable for a +/// hand-rolled distance loop. +pub struct ParquetBaselineData { + /// All rows concatenated: `elements.len() == num_rows * dim`. + pub elements: Vec, + /// Vector dimensionality. + pub dim: usize, + /// Number of rows. + pub num_rows: usize, + /// On-disk size of the parquet file in bytes. + pub file_size: u64, +} + +/// Run the decode / cosine / filter baseline microbenchmarks and return the best-of-N +/// wall times. Decoding is re-parquet-reading from disk on each iteration (matches how +/// the Vortex variants also re-execute from scratch each iteration). +pub fn run_parquet_baseline_timings( + parquet_path: &Path, + query: &[f32], + threshold: f32, + iterations: usize, +) -> Result { + let mut decode = Duration::MAX; + let mut cosine = Duration::MAX; + let mut filter = Duration::MAX; + + for _ in 0..iterations { + let start = Instant::now(); + let data = read_parquet_embedding_column(parquet_path)?; + decode = decode.min(start.elapsed()); + + let start = Instant::now(); + let scores = cosine_loop(&data.elements, data.num_rows, data.dim, query); + cosine = cosine.min(start.elapsed()); + debug_assert_eq!(scores.len(), data.num_rows); + + let start = Instant::now(); + let matches = filter_loop(&scores, threshold); + filter = filter.min(start.elapsed()); + debug_assert_eq!(matches.len(), data.num_rows); + } + + Ok(VariantTimings { + decode, + cosine, + filter, + }) +} + +/// Compute cosine similarity for every row against `query`. The query is assumed to match +/// the database vectors' dimension. 
Returns one f32 score per row; scores for zero-norm +/// rows or a zero-norm query are 0.0 by convention. +pub fn cosine_loop(elements: &[f32], num_rows: usize, dim: usize, query: &[f32]) -> Vec { + assert_eq!(query.len(), dim); + assert_eq!(elements.len(), num_rows * dim); + + let query_norm = query.iter().map(|&q| q * q).sum::().sqrt(); + let mut out = Vec::with_capacity(num_rows); + if query_norm == 0.0 { + out.resize(num_rows, 0.0); + return out; + } + + for row in 0..num_rows { + let base = row * dim; + let slice = &elements[base..base + dim]; + let mut dot = 0.0f32; + let mut sq = 0.0f32; + for i in 0..dim { + dot += slice[i] * query[i]; + sq += slice[i] * slice[i]; + } + let norm = sq.sqrt(); + if norm == 0.0 { + out.push(0.0); + } else { + out.push(dot / (norm * query_norm)); + } + } + out +} + +/// Build the `cosine > threshold` boolean mask. +pub fn filter_loop(scores: &[f32], threshold: f32) -> Vec { + scores.iter().map(|&s| s > threshold).collect() +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::sync::Arc; + + use arrow_array::RecordBatch; + use arrow_array::builder::FixedSizeListBuilder; + use arrow_array::builder::Float32Builder; + use arrow_schema::DataType; + use arrow_schema::Field; + use arrow_schema::Schema; + use parquet::arrow::ArrowWriter; + use tempfile::NamedTempFile; + + use super::*; + + /// Build a minimal parquet file with an `emb: FixedSizeList` column and + /// verify the baseline pipeline produces the expected scores. 
+ fn write_tiny_fsl_parquet(dim: i32, rows: &[&[f32]]) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new( + "emb", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), dim), + false, + )])); + + let file = NamedTempFile::new()?; + let mut writer = + ArrowWriter::try_new(File::create(file.path())?, Arc::clone(&schema), None)?; + + let dim_usize = usize::try_from(dim).unwrap(); + let mut builder = FixedSizeListBuilder::new(Float32Builder::new(), dim); + for row in rows { + assert_eq!(row.len(), dim_usize); + for &v in row.iter() { + builder.values().append_value(v); + } + builder.append(true); + } + let array = builder.finish(); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)])?; + writer.write(&batch)?; + writer.close()?; + Ok(file) + } + + #[test] + fn parquet_baseline_reads_fsl_column() { + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + + let data = read_parquet_embedding_column(file.path()).unwrap(); + assert_eq!(data.dim, 3); + assert_eq!(data.num_rows, 3); + assert_eq!(data.elements.len(), 9); + + let query = [1.0f32, 0.0, 0.0]; + let scores = cosine_loop(&data.elements, data.num_rows, data.dim, &query); + assert_eq!(scores, vec![1.0, 0.0, 1.0]); + + let mask = filter_loop(&scores, 0.5); + assert_eq!(mask, vec![true, false, true]); + } +} diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs new file mode 100644 index 00000000000..15132561b4d --- /dev/null +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Recall@K quality measurement for lossy vector-search variants. +//! +//! This module computes the fraction of the true top-K nearest neighbours that a +//! lossy encoding (today just TurboQuant) recovers, using the uncompressed Vortex +//! 
scan as the local ground truth. Recall is averaged over a small number of sampled +//! query rows. +//! +//! This is explicitly a *relative* recall — we compare TurboQuant-retrieved neighbours +//! against the neighbours that the *same* cosine-similarity expression finds in the +//! uncompressed scan, not against VectorDBBench's shipped `neighbors.parquet`. Comparing +//! against external ground truth would require an index (which Vortex doesn't have) and +//! is structurally out of scope for a file-format benchmark. + +use anyhow::Result; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::VortexSessionExecute; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::extension::ExtensionArrayExt; +use vortex::error::VortexExpect; +use vortex::session::VortexSession; +use vortex::utils::aliases::hash_set::HashSet; +use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; +use vortex_tensor::vector_search::build_constant_query_vector; + +/// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. +pub const DEFAULT_TOP_K: usize = 10; + +/// Compute recall@K for the lossy `compressed` variant against the `uncompressed` +/// ground-truth variant, averaged over `num_queries` sampled query rows. +/// +/// Query selection is deterministic: rows are picked uniformly across the dataset at +/// `step = uncompressed.len() / num_queries` intervals. This keeps the result stable +/// across runs and avoids needing to thread a PRNG seed into the benchmark CLI. 
+pub fn measure_recall_at_k( + uncompressed: &ArrayRef, + compressed: &ArrayRef, + num_queries: usize, + top_k: usize, + session: &VortexSession, +) -> Result { + assert!( + num_queries > 0, + "measure_recall_at_k requires num_queries > 0" + ); + assert!(top_k > 0, "measure_recall_at_k requires top_k > 0"); + let num_rows = uncompressed.len(); + assert_eq!( + compressed.len(), + num_rows, + "uncompressed and compressed arrays must have the same row count" + ); + assert!(num_rows >= top_k, "dataset must have at least top_k rows"); + + let step = (num_rows / num_queries).max(1); + + let mut total_hits: usize = 0; + let mut total_checked: usize = 0; + + for q in 0..num_queries { + let row = (q * step).min(num_rows - 1); + let query = extract_query_row(uncompressed, row, session)?; + + let gt_scores = score_all_rows(uncompressed, &query, session)?; + let truth = top_k_indices(>_scores, top_k); + + let lossy_scores = score_all_rows(compressed, &query, session)?; + let lossy = top_k_indices(&lossy_scores, top_k); + + let truth_set: HashSet = truth.iter().copied().collect(); + total_hits += lossy.iter().filter(|idx| truth_set.contains(*idx)).count(); + total_checked += top_k; + } + + Ok(total_hits as f64 / total_checked as f64) +} + +fn extract_query_row( + vector_ext: &ArrayRef, + row: usize, + session: &VortexSession, +) -> Result> { + use anyhow::Context; + use vortex::array::arrays::Extension; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + + let mut ctx = session.create_execution_ctx(); + let ext = vector_ext + .as_opt::() + .context("extract_query_row expects an Extension array")?; + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx)?; + + let dim_usize = match fsl.dtype() { + vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim as usize, + other => anyhow::bail!("expected FixedSizeList storage, got {other}"), + }; + + let elements: PrimitiveArray = 
fsl.elements().clone().execute(&mut ctx)?; + let slice = elements.as_slice::(); + let start = row * dim_usize; + Ok(slice[start..start + dim_usize].to_vec()) +} + +fn score_all_rows(data: &ArrayRef, query: &[f32], session: &VortexSession) -> Result> { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) + .vortex_expect("cosine similarity accepts matching Vector inputs") + .into_array(); + + let mut ctx = session.create_execution_ctx(); + let scores: PrimitiveArray = cosine.execute(&mut ctx)?; + Ok(scores.as_slice::().to_vec()) +} + +/// Return the indices of the top-K highest scores, stable-sorted descending. +fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { + let mut idx: Vec = (0..scores.len()).collect(); + idx.sort_by(|&a, &b| { + scores[b] + .partial_cmp(&scores[a]) + .unwrap_or(std::cmp::Ordering::Equal) + }); + idx.truncate(top_k); + idx +} + +#[cfg(test)] +mod tests { + use vortex_bench::SESSION; + + use super::*; + use crate::Variant; + use crate::prepare_variant; + use crate::test_utils::synthetic_vector; + + #[test] + fn uncompressed_has_perfect_self_recall() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10, &SESSION).unwrap(); + assert!( + (recall - 1.0).abs() < 1e-9, + "self-recall must be 1.0, got {recall}" + ); + } + + #[test] + fn turboquant_recall_is_reasonable_for_synthetic_data() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let prepared = crate::PreparedDataset { + name: "synthetic".to_string(), + uncompressed: uncompressed.clone(), + query: vec![], + parquet_bytes: 0, + }; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let (tq_array, _) = rt + 
.block_on(prepare_variant( + &prepared, + Variant::VortexTurboQuant, + &SESSION, + )) + .unwrap(); + + // With only 64 random rows, recall@10 won't be 1.0 but it should be well + // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch + // total regressions without being flaky on distribution noise. + let recall = measure_recall_at_k(&uncompressed, &tq_array, 4, 10, &SESSION).unwrap(); + assert!( + recall >= 0.3, + "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", + ); + } +} From d8a5b5b6e80e774a581be797bed47a4440bac35e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:06:56 +0000 Subject: [PATCH 06/18] ci: wire vector-search-bench into post-merge bench.yml matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `vector-search-bench` as a new entry in the post-merge `bench` matrix alongside `random-access-bench` and `compress-bench`. Uses the same build-bin / bench-taskset / gh-json / S3-upload pipeline so results flow into the existing benchmarks dashboard without any dashboard-side changes. The four formats passed on the CLI (`parquet,vortex-uncompressed,vortex-default,vortex-turboquant`) surface in the emitted JSON as three different `target.format` values — `Format::Parquet` for the hand-rolled baseline, `Format::OnDiskVortex` for the two lossless variants, and `Format::VortexTurboQuant` for the lossy one — so dashboards can render them as distinct bars per metric. Also promotes the `--variants` clap flag on the binary to `--formats` for naming consistency with the other standalone benchmarks (the CI script pipes `--formats ${{ matrix.benchmark.formats }}` uniformly). The underlying `Variant` enum is unchanged; only the CLI surface moved. Finally, adds `benchmarks/vector-search-bench/README.md` documenting what the benchmark measures, the four formats, local-run instructions, and — importantly — the dataset-mirror caveat. 
The current dataset URL points at `assets.zilliz.com` (Zilliz's public anonymous-readable bucket). Running this matrix entry on every develop merge will create recurring egress traffic on a third-party bucket; the README explains how to swap the URL in `VectorDataset::parquet_url` to an internal mirror before turning this on for a fork with heavy merge frequency. Skipping `bench-pr.yml` (PR runs) deliberately — v1 is develop-only until the mirror / egress question is resolved. Signed-off-by: Claude Signed-off-by: Connor Tsui --- .github/workflows/bench.yml | 4 ++ benchmarks/vector-search-bench/README.md | 81 ++++++++++++++++++++++ benchmarks/vector-search-bench/src/main.rs | 55 ++++++++++++--- 3 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 benchmarks/vector-search-bench/README.md diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7be829d80bd..f700aac0a78 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -48,6 +48,10 @@ jobs: name: Compression build_args: "--features lance" formats: "parquet,lance,vortex" + - id: vector-search-bench + name: Vector Similarity Search + build_args: "" + formats: "parquet,vortex-uncompressed,vortex-default,vortex-turboquant" steps: - uses: runs-on/action@v2 if: github.repository == 'vortex-data/vortex' diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md new file mode 100644 index 00000000000..7a10508910f --- /dev/null +++ b/benchmarks/vector-search-bench/README.md @@ -0,0 +1,81 @@ +# vector-search-bench + +Brute-force cosine-similarity benchmark for Vortex on public VectorDBBench +embedding corpora. + +## What it measures + +For each `(dataset, format)` pair, the benchmark records four numbers: + +1. **Size** — compressed storage footprint in bytes. For the Vortex variants + that round-trip through `.vortex` files today (uncompressed & BtrBlocks + default) this is the real on-disk size. 
For `vortex-turboquant` it is + the in-memory `.nbytes()` footprint, because the `L2Denorm` scalar-fn + array does not yet have a concrete `serialize_metadata` implementation. +2. **Full-scan decode time** — wall time to materialize the whole `Vector` + column into a `FixedSizeListArray`. +3. **Cosine-similarity execute time** — wall time for + `CosineSimilarity(data, const_query)` executed to a materialized f32 array. +4. **Cosine-filter execute time** — wall time for the full + `Binary(Gt, [CosineSimilarity, threshold])` expression tree executed to + a `BoolArray`. + +The TurboQuant variant additionally reports **Recall@10** against the +uncompressed Vortex scan as local ground truth. Lossless variants are trivially +1.0 so they are not re-measured. + +## Formats + +- `parquet` — Parquet file read via `parquet::arrow` into an Arrow + `FixedSizeListArray`, then a hand-rolled Rust cosine loop. This is the + "what you'd do without Vortex" external floor. +- `vortex-uncompressed` — Raw `Vector` extension array, no + encoding-level compression applied. +- `vortex-default` — `BtrBlocksCompressor::default()` applied to the FSL + storage child. Generic lossless Vortex compression for float vectors. +- `vortex-turboquant` — The full + `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` pipeline. + Lossy; recall@10 is reported alongside throughput. + +## Datasets + +The first dataset wired up is **Cohere-100K** (`cohere-small`): 100K rows × +768 dims, cosine metric, ~150 MB zstd-parquet. This is the smallest +VectorDBBench-supplied embedding corpus and sits comfortably inside a CI +time / bandwidth budget. + +The upstream URL is +`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. The +public Zilliz bucket is anonymous-readable so the code _can_ hit it directly. 
+ +## Running locally + +```bash +cargo run -p vector-search-bench --release -- \ + --datasets cohere-small \ + --formats parquet,vortex-uncompressed,vortex-default,vortex-turboquant \ + --iterations 5 \ + -d table +``` + +The first run downloads the parquet file into +`vortex-bench/data/cohere-small/cohere-small.parquet` and caches it +idempotently for subsequent runs. + +## CI note: dataset mirror + +CI runs after every develop-branch merge. Hitting `assets.zilliz.com` +from every merge would create recurring egress traffic on a third-party +bucket — the same courtesy reason `RPlace` / `AirQuality` are excluded +from CI in `compress-bench`. + +Before enabling the `vector-search-bench` entry in `.github/workflows/bench.yml` +on a fork, either: + +1. **Mirror the file into an internal bucket** and swap the URL in + `vortex-bench/src/vector_dataset.rs::VectorDataset::parquet_url`, or +2. **Accept the upstream egress cost** and leave the URL as-is. + +The mirror step is a one-off `aws s3 cp` and is documented here rather +than automated in the build because the destination bucket is +organization-specific. diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 5a5ef57a742..27096947317 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -54,14 +54,13 @@ struct Args { #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableDataset::CohereSmall])] datasets: Vec, - /// Subset of variants to exercise. Defaults to all three Vortex variants. - #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Variant::VortexUncompressed, Variant::VortexDefault, Variant::VortexTurboQuant])] - variants: Vec, - - /// Also run the Parquet-Arrow hand-rolled cosine baseline as an additional variant. - /// Default `true` — disable only when you intentionally want a Vortex-only comparison. 
- #[arg(long, default_value_t = true)] - parquet_baseline: bool, + /// Which benchmark variants to run, using kebab-cased labels. The `--formats` name is + /// used (instead of `--variants`) so this benchmark matches the CI invocation + /// convention shared across random-access-bench / compress-bench. Accepted values: + /// `parquet`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. Defaults + /// to running all four. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Parquet, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] + formats: Vec, /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables /// the quality measurement entirely (useful for smoke tests). @@ -103,6 +102,33 @@ impl SelectableDataset { } } +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] +enum SelectableFormat { + /// Parquet-Arrow hand-rolled cosine loop baseline. + #[clap(name = "parquet")] + Parquet, + /// Raw `Vector` with no encoding compression. + #[clap(name = "vortex-uncompressed")] + VortexUncompressed, + /// BtrBlocks default-compression applied to the FSL storage child. + #[clap(name = "vortex-default")] + VortexDefault, + /// Full TurboQuant pipeline (lossy). 
+ #[clap(name = "vortex-turboquant")] + VortexTurboQuant, +} + +impl SelectableFormat { + fn into_variant(self) -> Option { + match self { + SelectableFormat::Parquet => None, + SelectableFormat::VortexUncompressed => Some(Variant::VortexUncompressed), + SelectableFormat::VortexDefault => Some(Variant::VortexDefault), + SelectableFormat::VortexTurboQuant => Some(Variant::VortexTurboQuant), + } + } +} + #[tokio::main] async fn main() -> Result<()> { let args = Args::parse(); @@ -115,7 +141,14 @@ async fn main() -> Result<()> { .map(SelectableDataset::into_dataset) .collect(); - let total_work = datasets.len() * args.variants.len(); + let run_parquet_baseline = args.formats.contains(&SelectableFormat::Parquet); + let variants: Vec = args + .formats + .iter() + .filter_map(|f| f.into_variant()) + .collect(); + + let total_work = datasets.len() * args.formats.len(); let progress = ProgressBar::new(total_work as u64); let mut timings: Vec = Vec::new(); @@ -135,7 +168,7 @@ async fn main() -> Result<()> { // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex // variants. 
- if args.parquet_baseline { + if run_parquet_baseline { let parquet_path = dataset.to_parquet_path().await?; let baseline_timings = run_parquet_baseline_timings( &parquet_path, @@ -170,7 +203,7 @@ async fn main() -> Result<()> { }); } - for &variant in &args.variants { + for &variant in &variants { let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; let variant_label = variant.label(); From b286070eff4d06533c6ca927635273ba74e83386 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:15:00 +0000 Subject: [PATCH 07/18] vortex-bench: add Cohere-medium, OpenAI, Bioasq, Glove datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends `VectorDataset` with five additional cosine-metric corpora from the VectorDBBench collection: - CohereMedium: 1M × 768d (wiki-22-12) - OpenAiSmall: 50K × 1536d (OpenAI-on-C4) - OpenAiMedium: 500K × 1536d (OpenAI-on-C4) - BioasqMedium: 1M × 1024d (biomedical) - GloveMedium: 1M × 200d (word embeddings) All six built-in datasets are exported via a new `ALL_VECTOR_DATASETS` slice and exposed through the `vector-search-bench` `--datasets` flag so local runs can pick any of them. Adds an `all_datasets_have_consistent_metadata` test that verifies every built-in variant has a unique kebab-cased name, a `train.parquet` URL under `assets.zilliz.com/benchmark/`, a dimension above the TurboQuant minimum (128), and cosine metric (v1 only wires cosine). L2 / IP datasets (SIFT, GIST, LAION) remain out of scope until vortex-tensor gains an L2-distance scalar function. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/main.rs | 15 +++ vortex-bench/src/vector_dataset.rs | 106 ++++++++++++++++++++- 2 files changed, 116 insertions(+), 5 deletions(-) diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 27096947317..7d65e29598f 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -92,12 +92,27 @@ struct Args { enum SelectableDataset { #[clap(name = "cohere-small")] CohereSmall, + #[clap(name = "cohere-medium")] + CohereMedium, + #[clap(name = "openai-small")] + OpenAiSmall, + #[clap(name = "openai-medium")] + OpenAiMedium, + #[clap(name = "bioasq-medium")] + BioasqMedium, + #[clap(name = "glove-medium")] + GloveMedium, } impl SelectableDataset { fn into_dataset(self) -> VectorDataset { match self { SelectableDataset::CohereSmall => VectorDataset::CohereSmall, + SelectableDataset::CohereMedium => VectorDataset::CohereMedium, + SelectableDataset::OpenAiSmall => VectorDataset::OpenAiSmall, + SelectableDataset::OpenAiMedium => VectorDataset::OpenAiMedium, + SelectableDataset::BioasqMedium => VectorDataset::BioasqMedium, + SelectableDataset::GloveMedium => VectorDataset::GloveMedium, } } } diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs index d6a97d20ba9..86086ac57ac 100644 --- a/vortex-bench/src/vector_dataset.rs +++ b/vortex-bench/src/vector_dataset.rs @@ -37,30 +37,77 @@ use crate::idempotent_async; /// /// Each variant is one of the canonical VectorDBBench corpora, distributed as parquet under /// the Zilliz public benchmark bucket. The smaller `*Small` sizes are appropriate for CI -/// runs; the larger sizes are intended for local / on-demand experiments. +/// runs; the larger sizes are intended for local / on-demand experiments. 
Only +/// cosine-metric datasets are wired today — SIFT / GIST / LAION (L2) will follow when an +/// L2-distance scalar function lands in `vortex-tensor`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum VectorDataset { /// Cohere wiki-22-12, 100K rows × 768 dims, cosine metric. ~307 MB raw / ~150 MB /// zstd-parquet — the default CI-friendly size. CohereSmall, + /// Cohere wiki-22-12, 1M rows × 768 dims, cosine metric. ~3 GB raw / ~1.5 GB + /// zstd-parquet. Local-only by default. + CohereMedium, + /// OpenAI embeddings on C4, 50K rows × 1536 dims, cosine metric. ~307 MB raw — + /// the smallest OpenAI variant and comparable in size to Cohere-small, but with + /// double the dimensionality. + OpenAiSmall, + /// OpenAI embeddings on C4, 500K rows × 1536 dims, cosine metric. ~3 GB raw. + /// Local-only by default. + OpenAiMedium, + /// Bioasq biomedical embeddings, 1M rows × 1024 dims, cosine metric. ~4 GB raw. + /// Local-only by default. + BioasqMedium, + /// Glove word embeddings, 1M rows × 200 dims, cosine metric. ~800 MB raw. + GloveMedium, } +/// All built-in [`VectorDataset`] variants in a fixed order. Convenient for iterating or +/// for listing choices in CLI help. +pub const ALL_VECTOR_DATASETS: &[VectorDataset] = &[ + VectorDataset::CohereSmall, + VectorDataset::CohereMedium, + VectorDataset::OpenAiSmall, + VectorDataset::OpenAiMedium, + VectorDataset::BioasqMedium, + VectorDataset::GloveMedium, +]; + impl VectorDataset { /// The upstream URL for this dataset's canonical train-split parquet file. /// - /// **CI note**: point at an internal mirror before enabling this benchmark in CI. + /// **CI note**: point at an internal mirror before enabling this benchmark in CI — + /// see `benchmarks/vector-search-bench/README.md` for the procedure. 
pub fn parquet_url(&self) -> &'static str { match self { VectorDataset::CohereSmall => { "https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet" } + VectorDataset::CohereMedium => { + "https://assets.zilliz.com/benchmark/cohere_medium_1m/train.parquet" + } + VectorDataset::OpenAiSmall => { + "https://assets.zilliz.com/benchmark/openai_small_50k/train.parquet" + } + VectorDataset::OpenAiMedium => { + "https://assets.zilliz.com/benchmark/openai_medium_500k/train.parquet" + } + VectorDataset::BioasqMedium => { + "https://assets.zilliz.com/benchmark/bioasq_medium_1m/train.parquet" + } + VectorDataset::GloveMedium => { + "https://assets.zilliz.com/benchmark/glove_medium_1m/train.parquet" + } } } /// Fixed vector dimensionality for this dataset. pub fn dim(&self) -> u32 { match self { - VectorDataset::CohereSmall => 768, + VectorDataset::CohereSmall | VectorDataset::CohereMedium => 768, + VectorDataset::OpenAiSmall | VectorDataset::OpenAiMedium => 1536, + VectorDataset::BioasqMedium => 1024, + VectorDataset::GloveMedium => 200, } } @@ -68,14 +115,25 @@ impl VectorDataset { pub fn num_rows(&self) -> usize { match self { VectorDataset::CohereSmall => 100_000, + VectorDataset::CohereMedium => 1_000_000, + VectorDataset::OpenAiSmall => 50_000, + VectorDataset::OpenAiMedium => 500_000, + VectorDataset::BioasqMedium => 1_000_000, + VectorDataset::GloveMedium => 1_000_000, } } /// The distance metric the upstream dataset was curated for. v1 only wires cosine, so - /// this is informational today. + /// every built-in dataset returns [`VectorMetric::Cosine`]. The enum variant exists so + /// that L2 / inner-product datasets can be added later without a breaking change. 
pub fn metric(&self) -> VectorMetric { match self { - VectorDataset::CohereSmall => VectorMetric::Cosine, + VectorDataset::CohereSmall + | VectorDataset::CohereMedium + | VectorDataset::OpenAiSmall + | VectorDataset::OpenAiMedium + | VectorDataset::BioasqMedium + | VectorDataset::GloveMedium => VectorMetric::Cosine, } } } @@ -98,6 +156,11 @@ impl Dataset for VectorDataset { fn name(&self) -> &str { match self { VectorDataset::CohereSmall => "cohere-small", + VectorDataset::CohereMedium => "cohere-medium", + VectorDataset::OpenAiSmall => "openai-small", + VectorDataset::OpenAiMedium => "openai-medium", + VectorDataset::BioasqMedium => "bioasq-medium", + VectorDataset::GloveMedium => "glove-medium", } } @@ -142,6 +205,9 @@ impl Dataset for VectorDataset { #[cfg(test)] mod tests { + use vortex::utils::aliases::hash_set::HashSet; + + use super::ALL_VECTOR_DATASETS; use super::VectorDataset; use super::VectorMetric; use crate::datasets::Dataset; @@ -156,4 +222,34 @@ mod tests { assert!(ds.parquet_url().ends_with("/train.parquet")); assert!(ds.parquet_url().contains("cohere_small_100k")); } + + #[test] + fn all_datasets_have_consistent_metadata() { + // Every built-in dataset must have a unique kebab-cased name, point at a + // `train.parquet` file under `assets.zilliz.com/benchmark/`, declare a + // dimension ≥ `MIN_DIMENSION` for TurboQuant, a non-zero row count, and + // (for v1) cosine metric. 
+ let mut seen_names: HashSet = HashSet::default(); + for &ds in ALL_VECTOR_DATASETS { + let name = ds.name(); + assert!( + seen_names.insert(name.to_string()), + "duplicate dataset name {name}", + ); + assert!( + ds.dim() >= 128, + "{name} dim {} below TurboQuant minimum", + ds.dim() + ); + assert!(ds.num_rows() > 0, "{name} has zero rows"); + assert_eq!( + ds.metric(), + VectorMetric::Cosine, + "{name} must be cosine for v1" + ); + let url = ds.parquet_url(); + assert!(url.starts_with("https://assets.zilliz.com/benchmark/")); + assert!(url.ends_with("/train.parquet")); + } + } } From 616acdcba688fc1ccf9b0391072cbdd31fb556b3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 13:27:06 +0000 Subject: [PATCH 08/18] vector-search-bench: handle ListView, canonicalize chunks, add synth generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes two issues discovered while running the benchmark end-to-end against a real parquet file in a no-network environment: 1. `parquet_to_vortex_chunks` produces `ListView` arrays for list columns by default, so `list_to_vector_ext` was rejecting them with "expects a List array, got list(f32)". Added a fast path that applies `recursive_list_from_list_view` before the offset-stride validation kicks in. 2. `list_to_vector_ext` returns a `ChunkedArray>` when given a chunked input (the normal shape after parquet ingest), but downstream `extract_query_row` expects a single non-chunked `Extension`. `prepare_dataset` now calls `wrapped.execute::(&mut ctx)` to materialize the chunked wrapper into a single unchunked extension array. Also adds a new `gen_synthetic_dataset` helper binary that writes a VectorDBBench-shape parquet file (`id: int64`, `emb: list`, zstd-compressed) with deterministic xorshift-generated vectors. This is useful for: - Local dev runs of `vector-search-bench` without needing outbound network access to `assets.zilliz.com`. 
- Populating the `vortex-bench/data//.parquet` cache path so the benchmark's idempotent download step skips the HTTP fetch. - Sanity-checking the benchmark pipeline in CI environments that block outbound HTTPS to third-party buckets. Verified end-to-end against 5000×768 synthetic Cohere-small and 5000×1536 synthetic OpenAI-small inputs — every measurement pipeline (size, decode, cosine, filter, recall@10) produces valid gh-json and TurboQuant recall@10 lands at 0.91 on both fixtures, consistent with the loose recall floor the unit test enforces. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 1 + benchmarks/vector-search-bench/Cargo.toml | 1 + .../src/bin/gen_synthetic_dataset.rs | 139 ++++++++++++++++++ benchmarks/vector-search-bench/src/lib.rs | 11 +- vortex-bench/src/conversions.rs | 9 ++ 5 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs diff --git a/Cargo.lock b/Cargo.lock index c56f8e5330e..ebc8530825a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10058,6 +10058,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arrow-array 58.0.0", + "arrow-buffer 58.0.0", "arrow-schema 58.0.0", "async-trait", "clap", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index 068cf400d71..8e8101ddcee 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -17,6 +17,7 @@ publish = false [dependencies] anyhow = { workspace = true } arrow-array = { workspace = true } +arrow-buffer = { workspace = true } arrow-schema = { workspace = true } async-trait = { workspace = true } clap = { workspace = true, features = ["derive"] } diff --git a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs new file mode 100644 index 00000000000..3f5500edfcf --- /dev/null +++ 
b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Write a synthetic parquet file matching the VectorDBBench `emb: list` + `id: int64` +//! schema. Useful for local dev runs of `vector-search-bench` without needing network +//! access to `assets.zilliz.com`, and for sandbox / CI environments that block outbound +//! HTTPS. +//! +//! The generated file is bit-identical across runs for a given `(num_rows, dim, seed)` +//! triple so that downstream benchmark output is reproducible. +//! +//! Example: +//! +//! ```bash +//! cargo run -p vector-search-bench --bin gen-synthetic-dataset --release -- \ +//! --num-rows 5000 \ +//! --dim 768 \ +//! --out vortex-bench/data/cohere-small/cohere-small.parquet +//! ``` +//! +//! After running this, `vector-search-bench --datasets cohere-small` will find the +//! cached parquet file and skip the HTTP download via `idempotent_async`. + +use std::fs::File; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use anyhow::Result; +use arrow_array::Int64Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::builder::Float32Builder; +use arrow_array::builder::Int32BufferBuilder; +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::Schema; +use clap::Parser; +use parquet::arrow::ArrowWriter; +use parquet::basic::Compression; +use parquet::file::properties::WriterProperties; + +#[derive(Parser, Debug)] +#[command( + version, + about = "Generate a synthetic VectorDBBench-style parquet file" +)] +struct Args { + /// Number of rows to generate. + #[arg(long, default_value_t = 5000)] + num_rows: usize, + + /// Vector dimensionality. Must be ≥ 128 to exercise TurboQuant. + #[arg(long, default_value_t = 768)] + dim: u32, + + /// Deterministic PRNG seed — changing this changes the generated vectors. 
+ #[arg(long, default_value_t = 0xC0FFEE)] + seed: u64, + + /// Output parquet file path. Parent directory is created if missing. + #[arg(long)] + out: PathBuf, +} + +fn main() -> Result<()> { + let args = Args::parse(); + + if let Some(parent) = args.out.parent() { + std::fs::create_dir_all(parent)?; + } + + // Build an Arrow `ListArray` so the schema matches VectorDBBench's `emb: + // list` (note: NOT fixed_size_list — parquet has no FSL logical type so + // arrow-rs writes lists). Every list has exactly `dim` elements. + let dim_usize = args.dim as usize; + let total_elements = args.num_rows * dim_usize; + + let mut float_values = Float32Builder::with_capacity(total_elements); + let mut offsets = Int32BufferBuilder::new(args.num_rows + 1); + offsets.append(0i32); + + let mut state = args.seed.wrapping_add(1); + for row in 0..args.num_rows { + for i in 0..dim_usize { + // Deterministic xorshift mixed with position so every vector is distinct. + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + let scale = 1.0f32 / 32768.0; + let v = ((state & 0xFFFF) as f32 * scale - 0.5) + + ((row as f32 * 0.00013) + (i as f32 * 0.00007)).sin() * 0.25; + float_values.append_value(v); + } + let written = i32::try_from((row + 1) * dim_usize) + .context("offset overflows i32 — reduce num_rows or dim")?; + offsets.append(written); + } + + let values_array = float_values.finish(); + let offsets_buffer = offsets.finish(); + + let field = Arc::new(Field::new("item", DataType::Float32, false)); + let list_dtype = DataType::List(Arc::clone(&field)); + let list_array = ListArray::try_new( + Arc::clone(&field), + arrow_buffer::OffsetBuffer::new(offsets_buffer.into()), + Arc::new(values_array), + None, + )?; + + let ids: Int64Array = (0..args.num_rows as i64).collect(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("emb", list_dtype, false), + ])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + 
vec![Arc::new(ids), Arc::new(list_array)], + )?; + + let writer_props = WriterProperties::builder() + .set_compression(Compression::ZSTD(Default::default())) + .build(); + let file = File::create(&args.out)?; + let mut writer = ArrowWriter::try_new(file, schema, Some(writer_props))?; + writer.write(&batch)?; + writer.close()?; + + println!( + "wrote {} rows × {} dims to {}", + args.num_rows, + args.dim, + args.out.display() + ); + Ok(()) +} diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 27f61f39ed7..39a780875d1 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -152,6 +152,8 @@ impl PreparedDataset { /// Prepare a dataset by downloading its parquet file, converting the `emb` column to a /// `Vector` extension array, and extracting a single-row query vector. pub async fn prepare_dataset(dataset: &VectorDataset) -> Result { + use vortex::array::arrays::ExtensionArray; + let parquet_path = dataset .to_parquet_path() .await @@ -171,7 +173,14 @@ pub async fn prepare_dataset(dataset: &VectorDataset) -> Result let struct_array = chunked.into_array(); let emb_column = extract_emb_column(&struct_array)?; - let uncompressed = list_to_vector_ext(emb_column)?; + let wrapped = list_to_vector_ext(emb_column)?; + + // `list_to_vector_ext` may return a chunked `Extension` when the source was + // a `ChunkedArray` of list columns (the usual shape after `parquet_to_vortex_chunks`). + // Materialize it into a single non-chunked `ExtensionArray` so downstream code can + // treat it uniformly. 
+ let mut ctx = SESSION.create_execution_ctx(); + let uncompressed = wrapped.execute::(&mut ctx)?.into_array(); let query = extract_query_row(&uncompressed, DEFAULT_QUERY_ROW)?; diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 13e295e13c4..e31c3520d2f 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -26,8 +26,10 @@ use vortex::array::arrays::ChunkedArray; use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::List; +use vortex::array::arrays::ListView; use vortex::array::arrays::chunked::ChunkedArrayExt; use vortex::array::arrays::list::ListArrayExt; +use vortex::array::arrays::listview::recursive_list_from_list_view; use vortex::array::arrow::FromArrowArray; use vortex::array::builders::builder_with_capacity; use vortex::array::extension::EmptyMetadata; @@ -267,6 +269,13 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { return Ok(ChunkedArray::from_iter(converted).into_array()); } + // `parquet_to_vortex_chunks` produces `ListView` arrays for list columns by default; + // materialize them into a flat `List` representation before we validate offsets. + if input.as_opt::().is_some() { + let flat = recursive_list_from_list_view(input)?; + return list_to_vector_ext(flat); + } + let Some(list) = input.as_opt::() else { vortex_bail!( "list_to_vector_ext expects a List array, got dtype {}", From a12f0d054950c73dc8941d5d6df485ff65520615 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 14:07:12 +0000 Subject: [PATCH 09/18] vector-search-bench: verify correctness + fix misleading size / decompress measurements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A thorough audit of the benchmark's outputs found three issues that together made the results hard to interpret: 1. 
**"Size" was lying.** The old `measure_on_disk_size` helper wrote every variant through `SESSION.write_options()`, which uses `WriteStrategyBuilder::default()` — and that default applies BtrBlocks compression regardless of the in-memory tree. So both `vortex-uncompressed` and `vortex-default` ended up writing identical ALP-RD-compressed bytes to disk, reporting the same "size" number even though the two in-memory trees were different. Switched to reporting `ArrayRef::nbytes()` of the in-memory variant tree instead — the honest footprint of what the compute measurements actually operate on. Running the diagnostic `print_variant_trees` test now shows what each variant really is: ``` VortexUncompressed nbytes=15,360,000 Extension → FSL → Primitive VortexDefault nbytes=13,072,050 Extension → FSL → alprd(f32) [bitpacked u16/u32 + patches] VortexTurboQuant nbytes= 5,141,024 L2Denorm → SorfTransform → FSL → Dict(u8 codes + f32[256]) ``` So BtrBlocks really does find a ~15% lossless saving on random f32 via ALP-RD, and the previously-observed 3.5× cosine slowdown is the real cost of decoding ALP-RD on the fly inside `CosineSimilarity`. 2. **"Decompress time" for the compressed variants was a no-op.** The old `decode_full_scan` only executed the Extension shell and the FSL storage; it never forced the element buffer to materialize. Since FSL canonical form can keep its elements in their encoded representation, this reported ~60 ns for `vortex-default` even though reading any value would actually have to run the ALP-RD unpacking. Renamed to `decompress_full_scan` and added a final `fsl.elements().execute::()` call that forces the lazy decode to completion. `vortex-default`'s decompress time now shows ~6 ms on 5000×768 rows — the actual ALP-RD cost. 3. **No correctness verification.** The benchmark reported throughput numbers with no guarantee that the variants were producing the same answers. 
A new `verify.rs` module now computes cosine-similarity scores for one query against every variant *before* timing, compares them to the uncompressed baseline, and bails the run if any lossless variant drifts beyond 1e-4 max-abs-diff or any lossy variant drifts beyond 0.2 max-abs-diff. The `execute_cosine` implementation is promoted to `pub` and shared between the timing loop and verification so both paths exercise the exact same expression tree. Also adds a new `compress time` measurement alongside `decompress time`, so the dashboard can show the full round-trip cost per variant (previously we only timed one side). The TurboQuant verification is emitted as a warning instead of a hard error — the lossy tolerance is a quality-budget alert, not a correctness invariant. New unit tests (13 total in vector-search-bench, up from 4): - `verify::*` — 8 tests covering the `compare_scores`, `verify_scores`, and `verify_variant` paths including NaN handling and end-to-end lossless/lossy checks on synthetic 64×128 data. - `tests::uncompressed_decompress_is_fast` — regression guard ensuring the uncompressed variant's decompress pass is >5× faster than TurboQuant's so future refactors can't accidentally make the "uncompressed" path take the slow path. - `tests::print_variant_trees` — `#[ignore]` diagnostic test that dumps `display_tree_encodings_only()` for each variant. This is what was used to root-cause the size-lying issue. Verified end-to-end on synthetic 5000×768 Cohere-small: parquet nbytes=14,511,306 cosine= 2.9 ms correctness=0.00e0 vortex-uncompressed nbytes=15,360,000 cosine= 7.9 ms correctness=0.00e0 vortex-default nbytes=13,072,050 cosine=20.5 ms correctness=0.00e0 vortex-turboquant nbytes= 5,141,024 cosine=59.1 ms correctness=5.18e-3 recall@10 (tq) 0.9150 All lossless variants now produce bit-identical cosine scores (max diff exactly 0.0) against the uncompressed baseline, confirming the ALP-RD and hand-rolled parquet paths are correct. 
TurboQuant's 5.18e-3 drift is well within the 0.2 lossy tolerance. Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 281 +++++++++----- benchmarks/vector-search-bench/src/main.rs | 127 ++++++- .../src/parquet_baseline.rs | 6 +- benchmarks/vector-search-bench/src/recall.rs | 36 +- benchmarks/vector-search-bench/src/verify.rs | 344 ++++++++++++++++++ 5 files changed, 667 insertions(+), 127 deletions(-) create mode 100644 benchmarks/vector-search-bench/src/verify.rs diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 39a780875d1..fe27400dce6 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -3,27 +3,38 @@ //! Vector similarity-search benchmark core. //! -//! This crate measures four quantities for each `(dataset, variant)` pair: +//! For each `(dataset, variant)` pair we report: //! -//! 1. **Compressed storage size** (bytes on disk, or in-memory `.nbytes()` for variants that -//! don't yet serialize — currently just [`Variant::VortexTurboQuant`]). -//! 2. **Full-scan decode time** — executing the `Vector` column into a -//! materialized [`vortex::array::arrays::FixedSizeListArray`]. -//! 3. **Cosine-similarity execute time** — executing -//! `CosineSimilarity(data, const_query)` into a materialized f32 primitive array. -//! 4. **Filter execute time** — executing -//! `Binary(Gt, [CosineSimilarity, threshold])` into a -//! [`vortex::array::arrays::BoolArray`]. +//! - **In-memory size** — `ArrayRef::nbytes()` of the prepared variant tree. This is the +//! memory footprint you'd pay to keep that encoding resident. +//! - **Compress time** — the wall time to build the variant tree from the materialized +//! uncompressed source (0 for the uncompressed variant itself, the BtrBlocks pass for +//! `vortex-default`, the full L2Denorm+SORF+quantize pipeline for `vortex-turboquant`). +//! 
- **Decompress time** — the wall time to execute the variant tree back into a +//! canonical `FixedSizeListArray` (≈0 for the already-canonical uncompressed variant, +//! meaningful for the compressed variants). +//! - **Cosine time** — executing `CosineSimilarity(data, const_query)` to a materialized +//! f32 primitive array. +//! - **Filter time** — executing `Binary(Gt, [cosine, threshold])` to a `BoolArray`. +//! - **Recall@10** (for the lossy TurboQuant variant only) against exact top-10 from the +//! uncompressed variant. //! -//! Measurements are emitted via the existing `vortex_bench::measurements` types so that -//! the benchmark results flow through the standard `gh-json` pipeline and appear in the -//! CI dashboard alongside compress-bench / random-access-bench results. +//! Before any timing begins, the benchmark also runs a **correctness verification** pass +//! via [`verify`]: for every variant it computes cosine scores for a single query and +//! compares them to the ground-truth scores from the uncompressed variant. Lossless +//! variants must match within [`verify::LOSSLESS_TOLERANCE`]; lossy variants must match +//! within [`verify::LOSSY_TOLERANCE`]. A correctness failure bails the run. +//! +//! Measurements are emitted via the existing `vortex_bench::measurements` types so +//! results flow through the standard `gh-json` pipeline and show up on the CI dashboard +//! alongside compress-bench / random-access-bench. use std::time::Duration; use std::time::Instant; pub mod parquet_baseline; pub mod recall; +pub mod verify; use anyhow::Context; use anyhow::Result; @@ -260,67 +271,82 @@ fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { Ok(slice[start..start + dim_usize].to_vec()) } -/// Apply a `Variant`'s preparation strategy to the uncompressed Vortex array and return the -/// prepared array together with its reported size in bytes. 
For serializable variants the -/// size is the number of bytes written to a `.vortex` file; for in-memory-only variants -/// (TurboQuant) it's the live `.nbytes()` footprint. -pub async fn prepare_variant( +/// A prepared variant: the in-memory array tree plus the metadata we want to report +/// alongside it (size and construction cost). +#[derive(Debug, Clone)] +pub struct PreparedVariant { + /// The variant's in-memory array tree. For the uncompressed variant this is the same + /// canonical `Extension` pulled out of `prepare_dataset`; for the others it's + /// the output of the respective compression pipeline. + pub array: ArrayRef, + /// Summed byte footprint of the variant tree — `ArrayRef::nbytes()`. This is the + /// in-memory cost of keeping the variant resident, not a disk size. + pub nbytes: u64, + /// Wall time spent constructing the variant tree from the already-materialized + /// uncompressed source. 0 for [`Variant::VortexUncompressed`]; meaningful for the + /// compressed variants. + pub compress_duration: Duration, +} + +/// Apply a `Variant`'s preparation strategy to the materialized uncompressed source and +/// return the resulting tree together with its reported in-memory size and construction +/// time. +/// +/// **Why nbytes instead of on-disk size?** The Vortex file writer applies BtrBlocks +/// compression as part of its default write strategy regardless of the in-memory tree +/// shape, so serializing an "uncompressed" tree and measuring the resulting `.vortex` +/// file produces the same bytes as serializing a `BtrBlocksCompressor::default()`- +/// compressed tree — the disk-size comparison collapses two conceptually different +/// things into one number. Reporting `nbytes()` of the in-memory tree keeps the size +/// measurement consistent with what the *compute* measurements operate on. 
+pub fn prepare_variant( prepared: &PreparedDataset, variant: Variant, session: &VortexSession, -) -> Result<(ArrayRef, u64)> { +) -> Result { match variant { Variant::VortexUncompressed => { + // Identity: the uncompressed Extension is already materialized. Still + // record a dummy Instant so the timing point has a well-defined value even + // if it's effectively zero. + let start = Instant::now(); let array = prepared.uncompressed.clone(); - let size = - measure_on_disk_size(&array, session, &prepared.name, "uncompressed").await?; - Ok((array, size)) + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) } Variant::VortexDefault => { + let start = Instant::now(); let array = BtrBlocksCompressor::default().compress(&prepared.uncompressed)?; - let size = measure_on_disk_size(&array, session, &prepared.name, "default").await?; - Ok((array, size)) + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) } Variant::VortexTurboQuant => { let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; - // TurboQuant cannot yet round-trip through a Vortex file (L2Denorm metadata - // serialization is not implemented). Report the in-memory `.nbytes()` footprint - // as a proxy. Document this in the benchmark output so consumers of the - // dashboard aren't misled. - let size = array.nbytes() as u64; - Ok((array, size)) + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) } } } -/// Serialize a prepared Vortex array to a temporary `.vortex` file and return its length. 
-/// This is what we report as the "compressed size" for serializable variants; it matches -/// the semantics of `compress-bench` which reports the on-disk parquet/vortex file size. -async fn measure_on_disk_size( - array: &ArrayRef, - session: &VortexSession, - dataset_name: &str, - variant_label: &str, -) -> Result { - use vortex::file::WriteOptionsSessionExt; - - let tmp_dir = std::env::temp_dir().join("vortex-vector-search-bench"); - tokio::fs::create_dir_all(&tmp_dir).await?; - let tmp_path = tmp_dir.join(format!("{dataset_name}-{variant_label}.vortex")); - - let mut file = tokio::fs::File::create(&tmp_path).await?; - session - .write_options() - .write(&mut file, array.clone().to_array_stream()) - .await?; - - let metadata = tokio::fs::metadata(&tmp_path).await?; - Ok(metadata.len()) -} - -/// Run the decode / cosine / filter microbenchmarks against a prepared variant array and -/// return the best-of-`iterations` wall times for each measurement. +/// Run the decompress / cosine / filter microbenchmarks against a prepared variant +/// array and return the best-of-`iterations` wall times for each measurement. 
pub fn run_timings( variant_array: &ArrayRef, query: &[f32], @@ -329,15 +355,15 @@ pub fn run_timings( ) -> Result { let _ = QueryLen::default; // touch the type alias so rustc doesn't warn - let mut decode = Duration::MAX; + let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; for _ in 0..iterations { let mut ctx = session.create_execution_ctx(); let start = Instant::now(); - let decoded: FixedSizeListArray = decode_full_scan(variant_array, &mut ctx)?; - decode = decode.min(start.elapsed()); + let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; + decompress = decompress.min(start.elapsed()); drop(decoded); } @@ -358,7 +384,7 @@ pub fn run_timings( } Ok(VariantTimings { - decode, + decompress, cosine, filter, }) @@ -367,33 +393,54 @@ pub fn run_timings( /// Timing summary for one `(dataset, variant)` pair. #[derive(Debug, Clone, Copy)] pub struct VariantTimings { - /// Wall time for a full column decode. - pub decode: Duration, - /// Wall time for the cosine_similarity scalar-function execution. + /// Wall time to execute the variant's array tree back into a canonical + /// `FixedSizeListArray`. ~0 for [`Variant::VortexUncompressed`] (the tree is already + /// canonical), meaningful for the two compressed variants. + pub decompress: Duration, + /// Wall time for the cosine_similarity scalar-function execution over the whole + /// column (materialized into an `f32` [`PrimitiveArray`]). pub cosine: Duration, - /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression. + /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression executed + /// into a [`BoolArray`]. pub filter: Duration, } /// Fully materialize the input column so the measurement captures *all* decompression -/// work — the extension shell, the FSL storage, and the inner element buffer. +/// work — the extension shell, the FSL storage, **and the inner f32 element buffer**. 
+/// +/// Forcing the element buffer to materialize as a canonical `PrimitiveArray` is +/// what distinguishes this from a no-op cache hit. Executing the `ExtensionArray` or +/// `FixedSizeListArray` alone only unwraps the container shells — if the FSL's +/// `elements` child is (e.g.) an `alprd` tree, the bit-unpacking is lazy and only +/// happens when something reads the values. The `execute::` call below +/// forces that read. /// /// For the Vortex-uncompressed variant this is cheap (bitwise copy / no-op). For -/// BtrBlocks-default it includes FSL decompression. For TurboQuant it includes running -/// the inverse SORF rotation + dictionary lookup through the scalar-fn pipeline. -fn decode_full_scan( +/// BtrBlocks-default it includes the ALP-RD decoding pass. For TurboQuant it includes +/// running the inverse SORF rotation + dictionary lookup through the scalar-fn +/// pipeline. +pub fn decompress_full_scan( array: &ArrayRef, ctx: &mut vortex::array::ExecutionCtx, ) -> Result { use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; let ext: ExtensionArray = array.clone().execute(ctx)?; let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; + // Force the element buffer all the way down to a canonical PrimitiveArray so the + // timing captures any lazy decode work (ALP-RD bit unpacking, dict lookups, SORF + // inverse rotation, etc.). + let elements: PrimitiveArray = fsl.elements().clone().execute(ctx)?; + drop(elements); Ok(fsl) } -fn execute_cosine( +/// Execute `CosineSimilarity(data, broadcast(query))` to a materialized `f32` +/// [`PrimitiveArray`]. Shared between the timing loop and the correctness-verification +/// path so both exercise the exact same expression tree. 
+pub fn execute_cosine( data: &ArrayRef, query: &[f32], ctx: &mut vortex::array::ExecutionCtx, @@ -494,24 +541,96 @@ mod tests { Variant::VortexDefault, Variant::VortexTurboQuant, ] { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let (array, size) = rt - .block_on(prepare_variant(&prepared, variant, &SESSION)) - .unwrap(); + let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); assert_eq!( - array.len(), + prep.array.len(), num_rows, "variant {variant:?} changed row count" ); - assert!(size > 0, "variant {variant:?} reported zero size"); + assert!(prep.nbytes > 0, "variant {variant:?} reported zero size"); - let timings = run_timings(&array, &prepared.query, 2, &SESSION).unwrap(); - assert!(timings.decode > Duration::ZERO); + let timings = run_timings(&prep.array, &prepared.query, 2, &SESSION).unwrap(); + // TurboQuant + default must do real work; uncompressed's decompress is a + // no-op and can plausibly time as zero. assert!(timings.cosine > Duration::ZERO); assert!(timings.filter > Duration::ZERO); } } + + /// The **uncompressed** variant's decompress pass must be a no-op (the tree is + /// already canonical), while TurboQuant must do real work. This is a regression + /// guard for a future change accidentally making the uncompressed variant take the + /// slow path. 
+ #[test] + fn uncompressed_decompress_is_fast() { + let dim = 128u32; + let num_rows = 256usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xDEADBEEF); + + let prepared = PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query: vec![0.1f32; dim as usize], + parquet_bytes: 0, + }; + + let uncompressed_prep = + prepare_variant(&prepared, Variant::VortexUncompressed, &SESSION).unwrap(); + let turboquant_prep = + prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + + let unc_timings = + run_timings(&uncompressed_prep.array, &prepared.query, 3, &SESSION).unwrap(); + let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3, &SESSION).unwrap(); + + // The uncompressed decompress should be at least an order of magnitude faster + // than TurboQuant's (usually many orders of magnitude). 5x is a loose lower + // bound that won't flake on a noisy CI runner. + assert!( + tq_timings.decompress > unc_timings.decompress * 5, + "expected TurboQuant decompress ({:?}) to be >5x uncompressed ({:?})", + tq_timings.decompress, + unc_timings.decompress + ); + } + + /// Diagnostic: print the in-memory tree shape for each variant so we can see + /// exactly what BtrBlocks and TurboQuant do to the FSL storage. 
+ /// + /// Run with: + /// ```bash + /// cargo test -p vector-search-bench --release -- \ + /// --ignored --nocapture print_variant_trees + /// ``` + #[test] + #[ignore] + #[expect(clippy::use_debug, reason = "human-readable diagnostic output")] + fn print_variant_trees() { + let dim = 768u32; + let num_rows = 500usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let prepared = PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query: vec![0.1f32; dim as usize], + parquet_bytes: 0, + }; + + for variant in [ + Variant::VortexUncompressed, + Variant::VortexDefault, + Variant::VortexTurboQuant, + ] { + let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); + println!("=== {variant:?} ==="); + println!(" len : {}", prep.array.len()); + println!(" nbytes : {}", prep.nbytes); + println!(" compress_duration: {:?}", prep.compress_duration); + println!( + " encoding tree : {}", + prep.array.display_tree_encodings_only() + ); + } + } } diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 7d65e29598f..1e8b0643bc7 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -17,6 +17,7 @@ use std::borrow::Cow; use std::path::PathBuf; +use anyhow::Context; use anyhow::Result; use clap::Parser; use indicatif::ProgressBar; @@ -28,6 +29,9 @@ use vector_search_bench::prepare_variant; use vector_search_bench::recall::DEFAULT_TOP_K; use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; +use vector_search_bench::verify::VerificationKind; +use vector_search_bench::verify::compute_cosine_scores; +use vector_search_bench::verify::verify_variant; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_bench::create_output_writer; @@ -168,8 +172,8 @@ async fn main() -> Result<()> { let mut timings: Vec = Vec::new(); let mut sizes: Vec = Vec::new(); - let mut recalls: Vec = Vec::new(); + let 
mut verification: Vec = Vec::new(); for dataset in &datasets { let prepared = prepare_dataset(dataset).await?; @@ -180,11 +184,62 @@ async fn main() -> Result<()> { prepared.num_rows() ); + // Ground-truth cosine scores for the verification query — the scores produced by + // the uncompressed Vortex scan. Every other variant (including the parquet + // hand-rolled loop) will be compared against this. + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION) + .context("compute ground-truth cosine scores for verification")?; + tracing::info!( + "computed {} ground-truth cosine scores for {}", + baseline_scores.len(), + prepared.name + ); + // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex - // variants. + // variants. The parquet baseline uses a hand-rolled Rust cosine loop; it must + // match the Vortex cosine scores within lossless tolerance (f32 ULPs) because + // it's computing the same math on the same underlying f32 values. 
if run_parquet_baseline { let parquet_path = dataset.to_parquet_path().await?; + let baseline_data = + vector_search_bench::parquet_baseline::read_parquet_embedding_column(&parquet_path) + .context("read parquet emb column for verification")?; + let parquet_scores = vector_search_bench::parquet_baseline::cosine_loop( + &baseline_data.elements, + baseline_data.num_rows, + baseline_data.dim, + &prepared.query, + ); + let parquet_report = vector_search_bench::verify::verify_scores( + &baseline_scores, + &parquet_scores, + VerificationKind::Lossless, + ); + if !parquet_report.passed { + anyhow::bail!( + "parquet baseline correctness check failed on {}: \ + max_abs_diff={:.6}, mean_abs_diff={:.6}, tolerance={:.6}", + prepared.name, + parquet_report.max_abs_diff, + parquet_report.mean_abs_diff, + parquet_report.tolerance(), + ); + } + tracing::info!( + "parquet/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + prepared.name, + parquet_report.max_abs_diff, + parquet_report.mean_abs_diff, + ); + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/parquet/{}", prepared.name), + format: Format::Parquet, + unit: Cow::from("abs-diff"), + value: parquet_report.max_abs_diff, + }); + let baseline_timings = run_parquet_baseline_timings( &parquet_path, &prepared.query, @@ -202,9 +257,9 @@ async fn main() -> Result<()> { value: prepared.parquet_bytes as f64, }); timings.push(CompressionTimingMeasurement { - name: format!("decode time/{bench_name}"), + name: format!("decompress time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.decode, + time: baseline_timings.decompress, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-similarity time/{bench_name}"), @@ -219,25 +274,67 @@ async fn main() -> Result<()> { } for &variant in &variants { - let (variant_array, size_bytes) = prepare_variant(&prepared, variant, &SESSION).await?; + let prep = prepare_variant(&prepared, variant, &SESSION)?; let variant_label = 
variant.label(); let bench_name = format!("{variant_label}/{}", prepared.name); + // Correctness verification BEFORE timing. Lossless variants must match + // the uncompressed baseline within f32 noise; TurboQuant must stay within + // its lossy tolerance. A failure bails the whole run — you cannot publish + // throughput numbers for an encoding that returns wrong answers. + let kind = if variant == Variant::VortexTurboQuant { + VerificationKind::Lossy + } else { + VerificationKind::Lossless + }; + let report = verify_variant( + &bench_name, + &prep.array, + &prepared.query, + &baseline_scores, + kind, + &SESSION, + )?; + tracing::info!( + "{} verification ({:?}): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, + kind, + report.max_abs_diff, + report.mean_abs_diff, + ); + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/{bench_name}"), + format: variant.as_format(), + unit: Cow::from("abs-diff"), + value: report.max_abs_diff, + }); + + // In-memory nbytes — the honest size of the variant tree we're executing. sizes.push(CustomUnitMeasurement { - name: format!("{variant_label} size/{}", prepared.name), + name: format!("{variant_label} nbytes/{}", prepared.name), format: variant.as_format(), unit: Cow::from("bytes"), - value: size_bytes as f64, + value: prep.nbytes as f64, + }); + + // Compress time — the wall time it takes to build the variant tree from + // the materialized uncompressed source. For the uncompressed variant + // itself this is ~0 (identity), so we still emit it as a measurement for + // dashboard consistency. 
+ timings.push(CompressionTimingMeasurement { + name: format!("compress time/{bench_name}"), + format: variant.as_format(), + time: prep.compress_duration, }); let variant_timings = - run_timings(&variant_array, &prepared.query, args.iterations, &SESSION)?; + run_timings(&prep.array, &prepared.query, args.iterations, &SESSION)?; timings.push(CompressionTimingMeasurement { - name: format!("decode time/{bench_name}"), + name: format!("decompress time/{bench_name}"), format: variant.as_format(), - time: variant_timings.decode, + time: variant_timings.decompress, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-similarity time/{bench_name}"), @@ -251,12 +348,12 @@ async fn main() -> Result<()> { }); // Recall@K quality measurement for lossy variants only. The lossless - // variants (uncompressed + BtrBlocks default) are trivially 1.0 against - // the uncompressed ground truth, so we skip them to avoid noise. + // variants are trivially 1.0 by construction (since they agree with the + // uncompressed baseline within 1e-4) so we skip them to keep noise down. 
if args.recall_queries > 0 && variant == Variant::VortexTurboQuant { let recall = measure_recall_at_k( &prepared.uncompressed, - &variant_array, + &prep.array, args.recall_queries, args.recall_k, &SESSION, @@ -295,11 +392,15 @@ async fn main() -> Result<()> { recall.name, recall.value, recall.unit )?; } + for check in &verification { + writeln!(writer, "{} {:.6e} {}", check.name, check.value, check.unit)?; + } } DisplayFormat::GhJson => { print_measurements_json(&mut writer, timings)?; print_measurements_json(&mut writer, sizes)?; print_measurements_json(&mut writer, recalls)?; + print_measurements_json(&mut writer, verification)?; } } diff --git a/benchmarks/vector-search-bench/src/parquet_baseline.rs b/benchmarks/vector-search-bench/src/parquet_baseline.rs index b80a2308370..222a125f362 100644 --- a/benchmarks/vector-search-bench/src/parquet_baseline.rs +++ b/benchmarks/vector-search-bench/src/parquet_baseline.rs @@ -172,14 +172,14 @@ pub fn run_parquet_baseline_timings( threshold: f32, iterations: usize, ) -> Result { - let mut decode = Duration::MAX; + let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; for _ in 0..iterations { let start = Instant::now(); let data = read_parquet_embedding_column(parquet_path)?; - decode = decode.min(start.elapsed()); + decompress = decompress.min(start.elapsed()); let start = Instant::now(); let scores = cosine_loop(&data.elements, data.num_rows, data.dim, query); @@ -193,7 +193,7 @@ pub fn run_parquet_baseline_timings( } Ok(VariantTimings { - decode, + decompress, cosine, filter, }) diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index 15132561b4d..1c587616d4e 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -16,15 +16,13 @@ use anyhow::Result; use vortex::array::ArrayRef; -use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; use 
vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::extension::ExtensionArrayExt; -use vortex::error::VortexExpect; use vortex::session::VortexSession; use vortex::utils::aliases::hash_set::HashSet; -use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; -use vortex_tensor::vector_search::build_constant_query_vector; + +use crate::verify::compute_cosine_scores; /// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. pub const DEFAULT_TOP_K: usize = 10; @@ -64,10 +62,10 @@ pub fn measure_recall_at_k( let row = (q * step).min(num_rows - 1); let query = extract_query_row(uncompressed, row, session)?; - let gt_scores = score_all_rows(uncompressed, &query, session)?; + let gt_scores = compute_cosine_scores(uncompressed, &query, session)?; let truth = top_k_indices(>_scores, top_k); - let lossy_scores = score_all_rows(compressed, &query, session)?; + let lossy_scores = compute_cosine_scores(compressed, &query, session)?; let lossy = top_k_indices(&lossy_scores, top_k); let truth_set: HashSet = truth.iter().copied().collect(); @@ -105,18 +103,6 @@ fn extract_query_row( Ok(slice[start..start + dim_usize].to_vec()) } -fn score_all_rows(data: &ArrayRef, query: &[f32], session: &VortexSession) -> Result> { - let num_rows = data.len(); - let query_vec = build_constant_query_vector(query, num_rows)?; - let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) - .vortex_expect("cosine similarity accepts matching Vector inputs") - .into_array(); - - let mut ctx = session.create_execution_ctx(); - let scores: PrimitiveArray = cosine.execute(&mut ctx)?; - Ok(scores.as_slice::().to_vec()) -} - /// Return the indices of the top-K highest scores, stable-sorted descending. 
fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { let mut idx: Vec = (0..scores.len()).collect(); @@ -164,22 +150,12 @@ mod tests { parquet_bytes: 0, }; - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let (tq_array, _) = rt - .block_on(prepare_variant( - &prepared, - Variant::VortexTurboQuant, - &SESSION, - )) - .unwrap(); + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); // With only 64 random rows, recall@10 won't be 1.0 but it should be well // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch // total regressions without being flaky on distribution noise. - let recall = measure_recall_at_k(&uncompressed, &tq_array, 4, 10, &SESSION).unwrap(); + let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10, &SESSION).unwrap(); assert!( recall >= 0.3, "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs new file mode 100644 index 00000000000..7dd6a564f36 --- /dev/null +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Correctness verification for vector-search variants. +//! +//! Before the timing loop runs, we compute cosine-similarity scores for a single query +//! row against the uncompressed baseline and against each prepared variant, then compare +//! the two score vectors element-by-element. This catches two distinct classes of bug: +//! +//! - A **lossless variant** that disagrees with the uncompressed scan (bug in the +//! compression pipeline, or in how we're routing through the scalar-fn dispatch, or in +//! the variant-specific decompress path). +//! - A **lossy variant** (TurboQuant) that drifts further from ground truth than we +//! 
expect from the bit-width and SORF rotation settings (regression in the encoder). +//! +//! The same `execute_cosine` function the timing loop uses is also what verification +//! uses, so the correctness check is validating the *exact* expression tree we're about +//! to benchmark. Lossless variants must match within [`LOSSLESS_TOLERANCE`]; lossy +//! variants must match within [`LOSSY_TOLERANCE`]. A hard-stop `Err` return on any +//! mismatch keeps the benchmark honest — you cannot publish throughput numbers for a +//! variant that's returning garbage. + +use anyhow::Result; +use anyhow::bail; +use vortex::array::ArrayRef; +use vortex::array::VortexSessionExecute; +use vortex::session::VortexSession; + +use crate::execute_cosine; + +/// Maximum acceptable absolute difference in cosine scores for a *lossless* variant +/// (uncompressed, BtrBlocks-default). `cosine_similarity` traverses the FSL storage and +/// reduces with f32 accumulators, so a pure algebraic change of encoding can shift a +/// score by a few ULPs of f32 precision. `1e-4` is well above that noise floor while +/// still catching real regressions. +pub const LOSSLESS_TOLERANCE: f32 = 1e-4; + +/// Maximum acceptable absolute difference in cosine scores for the *lossy* TurboQuant +/// variant. At the default 8-bit configuration the reconstructed dot product typically +/// drifts by well under 0.05 for unit-normalized vectors. `0.2` is a loose upper bound +/// that catches regressions without flaking on distribution-specific noise. +pub const LOSSY_TOLERANCE: f32 = 0.2; + +/// Row index used to pick the verification query. Row 0 is also what +/// [`crate::DEFAULT_QUERY_ROW`] selects, so the verification and timing paths exercise +/// identical inputs. +pub const VERIFICATION_QUERY_ROW: usize = 0; + +/// How lossy a variant is allowed to be when its scores are compared to the +/// uncompressed baseline. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VerificationKind { + /// Lossless variants must match within [`LOSSLESS_TOLERANCE`]. + Lossless, + /// Lossy variants must match within [`LOSSY_TOLERANCE`]. + Lossy, +} + +/// Per-variant correctness report. Captured for both pass and fail outcomes so the +/// caller can emit the numbers as dashboard measurements regardless. +#[derive(Debug, Clone, Copy)] +pub struct VerificationReport { + /// Number of rows compared (== dataset row count). + pub num_scores: usize, + /// Mean absolute difference between baseline and variant cosine scores. + pub mean_abs_diff: f64, + /// Max absolute difference between baseline and variant cosine scores. + pub max_abs_diff: f64, + /// Which tolerance band applied. + pub kind: VerificationKind, + /// Whether the variant's max-abs-diff stayed within its tolerance. + pub passed: bool, +} + +impl VerificationReport { + /// The tolerance that was applied to produce [`Self::passed`]. + pub fn tolerance(&self) -> f32 { + match self.kind { + VerificationKind::Lossless => LOSSLESS_TOLERANCE, + VerificationKind::Lossy => LOSSY_TOLERANCE, + } + } +} + +/// Compute cosine-similarity scores for a single query row on `data` and return them +/// as a plain `Vec`. This is just a convenience wrapper around +/// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting +/// `PrimitiveArray`. +pub fn compute_cosine_scores( + data: &ArrayRef, + query: &[f32], + session: &VortexSession, +) -> Result> { + let mut ctx = session.create_execution_ctx(); + let scores = execute_cosine(data, query, &mut ctx)?; + Ok(scores.as_slice::().to_vec()) +} + +/// Compare two equal-length score vectors and return their mean absolute difference +/// and max absolute difference, without evaluating a pass/fail threshold. 
+pub fn compare_scores(baseline: &[f32], other: &[f32]) -> (f64, f64) { + assert_eq!( + baseline.len(), + other.len(), + "compare_scores: length mismatch baseline={} other={}", + baseline.len(), + other.len(), + ); + + if baseline.is_empty() { + return (0.0, 0.0); + } + + let mut sum = 0.0f64; + let mut max: f64 = 0.0; + for (&b, &o) in baseline.iter().zip(other.iter()) { + // Treat (+0, -0) pairs as equal and propagate NaN as inf so it always fails + // the tolerance check below. + let diff = if b.is_nan() || o.is_nan() { + f64::INFINITY + } else { + (f64::from(b) - f64::from(o)).abs() + }; + sum += diff; + if diff > max { + max = diff; + } + } + (sum / baseline.len() as f64, max) +} + +/// Verify one variant's scores against a baseline and produce a full +/// [`VerificationReport`]. Whether `passed` is true depends on `kind`'s tolerance. +pub fn verify_scores( + baseline: &[f32], + variant_scores: &[f32], + kind: VerificationKind, +) -> VerificationReport { + let (mean_abs_diff, max_abs_diff) = compare_scores(baseline, variant_scores); + let tolerance = match kind { + VerificationKind::Lossless => f64::from(LOSSLESS_TOLERANCE), + VerificationKind::Lossy => f64::from(LOSSY_TOLERANCE), + }; + let passed = max_abs_diff <= tolerance; + VerificationReport { + num_scores: baseline.len(), + mean_abs_diff, + max_abs_diff, + kind, + passed, + } +} + +/// End-to-end variant verification: executes cosine on `variant_array` against the +/// same query used for the baseline and returns a [`VerificationReport`]. Returns +/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond +/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality +/// tradeoff. 
+pub fn verify_variant( + variant_name: &str, + variant_array: &ArrayRef, + query: &[f32], + baseline_scores: &[f32], + kind: VerificationKind, + session: &VortexSession, +) -> Result { + let scores = compute_cosine_scores(variant_array, query, session)?; + let report = verify_scores(baseline_scores, &scores, kind); + + if !report.passed { + let message = format!( + "{variant_name} correctness check failed: max_abs_diff={:.6}, \ + mean_abs_diff={:.6}, tolerance={:.6} ({:?})", + report.max_abs_diff, + report.mean_abs_diff, + report.tolerance(), + report.kind, + ); + match kind { + VerificationKind::Lossless => bail!("{message}"), + VerificationKind::Lossy => { + tracing::warn!("{message}"); + } + } + } + + Ok(report) +} + +#[cfg(test)] +mod tests { + use vortex_bench::SESSION; + + use super::*; + use crate::Variant; + use crate::prepare_variant; + use crate::test_utils::synthetic_vector; + + fn make_prepared(dim: u32, num_rows: usize, seed: u64) -> crate::PreparedDataset { + let uncompressed = synthetic_vector(dim, num_rows, seed); + crate::PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + // Filled in below from row 0. 
+ query: vec![], + parquet_bytes: 0, + } + } + + fn extract_row_zero(uncompressed: &ArrayRef, dim: u32) -> Vec { + use vortex::array::VortexSessionExecute; + use vortex::array::arrays::Extension; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + + let mut ctx = SESSION.create_execution_ctx(); + let ext = uncompressed.as_opt::().unwrap(); + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); + elements.as_slice::()[..dim as usize].to_vec() + } + + #[test] + fn compare_scores_handles_empty() { + let (mean, max) = compare_scores(&[], &[]); + assert_eq!(mean, 0.0); + assert_eq!(max, 0.0); + } + + #[test] + fn compare_scores_computes_mae_and_max() { + let base = [0.0f32, 1.0, 2.0, 3.0]; + let other = [0.0f32, 1.0, 2.5, 3.0]; + let (mean, max) = compare_scores(&base, &other); + assert!((max - 0.5).abs() < 1e-9); + assert!((mean - 0.125).abs() < 1e-9); + } + + #[test] + fn verify_scores_passes_for_identical_inputs() { + let base = [0.5f32; 10]; + let report = verify_scores(&base, &base, VerificationKind::Lossless); + assert!(report.passed); + assert_eq!(report.max_abs_diff, 0.0); + assert_eq!(report.mean_abs_diff, 0.0); + assert_eq!(report.num_scores, 10); + } + + #[test] + fn verify_scores_fails_for_lossless_beyond_tolerance() { + let base = [0.5f32; 10]; + let mut other = [0.5f32; 10]; + other[3] = 0.50001; // diff ≈ 1e-5, comfortably below the 1e-4 lossless bound + let report_ok = verify_scores(&base, &other, VerificationKind::Lossless); + assert!( + report_ok.passed, + "1e-5 drift should pass, got max={:.2e}", + report_ok.max_abs_diff + ); + + other[3] = 0.51; // diff of 0.01, well above 1e-4 + let report_bad = verify_scores(&base, &other, VerificationKind::Lossless); + assert!( + 
!report_bad.passed, + "1e-2 drift should fail, got max={:.2e}", + report_bad.max_abs_diff + ); + } + + #[test] + fn verify_scores_lossy_tolerates_small_drift() { + let base = [0.9f32; 10]; + let mut other = [0.9f32; 10]; + other[0] = 1.0; // diff of 0.1 + let report = verify_scores(&base, &other, VerificationKind::Lossy); + assert!( + report.passed, + "0.1 drift should pass lossy tolerance, got max={}", + report.max_abs_diff + ); + } + + #[test] + fn verify_scores_fails_on_nan() { + let base = [0.5f32, 0.5]; + let other = [0.5f32, f32::NAN]; + let report = verify_scores(&base, &other, VerificationKind::Lossless); + assert!(!report.passed); + assert!(report.max_abs_diff.is_infinite()); + } + + #[test] + fn vortex_default_matches_uncompressed_end_to_end() { + let dim = 128u32; + let num_rows = 64usize; + let mut prepared = make_prepared(dim, num_rows, 0xC0FFEE); + prepared.query = extract_row_zero(&prepared.uncompressed, dim); + + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + + let default_prep = prepare_variant(&prepared, Variant::VortexDefault, &SESSION).unwrap(); + let report = verify_variant( + "vortex-default", + &default_prep.array, + &prepared.query, + &baseline_scores, + VerificationKind::Lossless, + &SESSION, + ) + .expect("vortex-default must be lossless against the uncompressed baseline"); + assert!(report.passed); + } + + #[test] + fn vortex_turboquant_stays_within_lossy_tolerance() { + let dim = 128u32; + let num_rows = 64usize; + let mut prepared = make_prepared(dim, num_rows, 0xDEADBEEF); + prepared.query = extract_row_zero(&prepared.uncompressed, dim); + + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let report = verify_variant( + "vortex-turboquant", + &tq_prep.array, + &prepared.query, + &baseline_scores, + VerificationKind::Lossy, + 
&SESSION, + ) + .expect("TurboQuant verification should not error"); + assert!( + report.passed, + "TurboQuant drift {:.4} exceeds lossy tolerance {:.4}", + report.max_abs_diff, + report.tolerance() + ); + } +} From 236e098b091aea37567b75385356478d0ff939fb Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 14:40:52 +0000 Subject: [PATCH 10/18] =?UTF-8?q?vector-search-bench:=20rename=20"parquet"?= =?UTF-8?q?=20baseline=20=E2=86=92=20"handrolled"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compute side of the baseline isn't really a parquet implementation — it's a **hand-rolled Rust scalar cosine loop over a flat `Vec`** that happens to have been decoded from a parquet file via `parquet-rs` / `arrow-rs`. Labeling it as "parquet" in CLI flags, metric names, and docs suggested the baseline was measuring a real parquet-based analytical engine, when in reality it's the minimum cost a Rust programmer could get away with given no query engine. User-facing rename: everywhere the baseline was called "parquet" it now says "handrolled" — the CLI format, the metric labels, the CI matrix entry, the README, the type and function names in the baseline module, and the module filename itself (`parquet_baseline.rs` → `handrolled_baseline.rs`). Unchanged: - `read_parquet_embedding_column` keeps its name because it really does read parquet. Only the baseline-level wrappers take the `handrolled` label. - `target.format` stays `Format::Parquet` — the storage side really is parquet on disk, and dashboards grouping by format should still put the baseline's size/decompress measurements under "parquet". The `CompressionTimingMeasurement::to_json` prefix therefore still reads `"parquet_rs-zstd "`, which honestly describes the parquet reader stack. Only the compute label in the metric `name` string reflects the hand-rolled compute path. 
The module-level doc comment on `handrolled_baseline.rs` now explicitly frames the baseline as a compute-cost *floor*, not a fair DBMS comparison, and points at DuckDB `list_cosine_similarity` and DataFusion vector UDFs as better "what real users pay for parquet+cosine" baselines for future work. Verified end-to-end against the 5000×768 synthetic cohere-small fixture. All 13 existing tests still pass (the baseline test rename from `parquet_baseline_reads_fsl_column` to `handrolled_baseline_reads_fsl_column` is the only functional change to tests). Correctness verification still reports: handrolled max_abs_diff = 0.000e0 (bit-identical, lossless) vortex-uncompressed max_abs_diff = 0.000e0 vortex-default max_abs_diff = 0.000e0 vortex-turboquant max_abs_diff = 5.18e-3 (within 0.2 lossy tolerance) Signed-off-by: Claude Signed-off-by: Connor Tsui --- .github/workflows/bench.yml | 2 +- benchmarks/vector-search-bench/README.md | 109 +++++++++++++----- ...uet_baseline.rs => handrolled_baseline.rs} | 86 +++++++++----- benchmarks/vector-search-bench/src/lib.rs | 2 +- benchmarks/vector-search-bench/src/main.rs | 86 ++++++++------ 5 files changed, 188 insertions(+), 97 deletions(-) rename benchmarks/vector-search-bench/src/{parquet_baseline.rs => handrolled_baseline.rs} (69%) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index f700aac0a78..f900022e2f6 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -51,7 +51,7 @@ jobs: - id: vector-search-bench name: Vector Similarity Search build_args: "" - formats: "parquet,vortex-uncompressed,vortex-default,vortex-turboquant" + formats: "handrolled,vortex-uncompressed,vortex-default,vortex-turboquant" steps: - uses: runs-on/action@v2 if: github.repository == 'vortex-data/vortex' diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md index 7a10508910f..49f117e5c26 100644 --- a/benchmarks/vector-search-bench/README.md +++ 
b/benchmarks/vector-search-bench/README.md @@ -5,55 +5,88 @@ embedding corpora. ## What it measures -For each `(dataset, format)` pair, the benchmark records four numbers: - -1. **Size** — compressed storage footprint in bytes. For the Vortex variants - that round-trip through `.vortex` files today (uncompressed & BtrBlocks - default) this is the real on-disk size. For `vortex-turboquant` it is - the in-memory `.nbytes()` footprint, because the `L2Denorm` scalar-fn - array does not yet have a concrete `serialize_metadata` implementation. -2. **Full-scan decode time** — wall time to materialize the whole `Vector` - column into a `FixedSizeListArray`. -3. **Cosine-similarity execute time** — wall time for - `CosineSimilarity(data, const_query)` executed to a materialized f32 array. -4. **Cosine-filter execute time** — wall time for the full - `Binary(Gt, [CosineSimilarity, threshold])` expression tree executed to - a `BoolArray`. - -The TurboQuant variant additionally reports **Recall@10** against the -uncompressed Vortex scan as local ground truth. Lossless variants are trivially -1.0 so they are not re-measured. +For each `(dataset, format)` pair, the benchmark records: + +1. **`nbytes`** — in-memory footprint of the variant's array tree, in bytes. + Reporting the in-memory `.nbytes()` instead of an on-disk file size is + deliberate: the Vortex default write path runs BtrBlocks on every tree + regardless of whether it's already compressed, so "on-disk size" would + collapse `vortex-uncompressed` and `vortex-default` to the same bytes + even though their in-memory trees are different. The `nbytes()` + number is consistent with what the *compute* measurements actually + operate on. + - The `handrolled` baseline reports the canonical parquet file size + on disk — that's the only encoded representation it has. +2. **Compress time** — wall time to build the variant tree from the + materialized uncompressed source. 
~0 for `vortex-uncompressed` (identity), + meaningful for the two compressed variants. +3. **Decompress time** — wall time to execute the variant tree all the way + back into a canonical `FixedSizeListArray` with a materialized f32 + element buffer. For `vortex-uncompressed` this is a no-op; for + `vortex-default` it includes ALP-RD bit-unpacking; for + `vortex-turboquant` it includes the inverse SORF rotation and + dictionary lookup. +4. **Cosine-similarity time** — `CosineSimilarity(data, const_query)` + executed to a materialized f32 array. +5. **Cosine-filter time** — `Binary(Gt, [CosineSimilarity, threshold])` + executed to a `BoolArray`. +6. **Recall@10** (TurboQuant only) — the fraction of the exact top-10 + nearest neighbours that TurboQuant recovers, using the uncompressed + Vortex scan as local ground truth. + +Before any timing starts, the benchmark runs a **correctness verification +pass**: cosine scores for a single query are computed against every +variant and compared to the uncompressed baseline. Lossless variants must +match within `1e-4` max-abs-diff; TurboQuant must stay within `0.2`. A +mismatch bails the run — you cannot publish throughput numbers for a +variant that returns wrong answers. ## Formats -- `parquet` — Parquet file read via `parquet::arrow` into an Arrow - `FixedSizeListArray`, then a hand-rolled Rust cosine loop. This is the - "what you'd do without Vortex" external floor. +- `handrolled` — Hand-rolled Rust scalar cosine loop over a flat + `Vec` that was decoded from the canonical parquet file via + `parquet-rs` / `arrow-rs`. The **decompress** phase does the parquet + read, downcasts to `Float32Array`, and memcpies into a plain `Vec`. + The **compute** phase is a plain scalar loop over `&[f32]` — no Arrow + compute kernels, no scalar-function dispatch, no SIMD annotations. + + This is a **compute-cost floor**, not a realistic parquet-on-DBMS + baseline. 
It answers the question "what's the minimum cost you could + get away with if you wrote a vector-search scan by hand with no query + engine?" Real parquet users would pay substantially more (DuckDB + `list_cosine_similarity`, DataFusion with a vector UDF, etc.) — + adding those as additional baselines is a natural v2 direction. - `vortex-uncompressed` — Raw `Vector` extension array, no encoding-level compression applied. - `vortex-default` — `BtrBlocksCompressor::default()` applied to the FSL - storage child. Generic lossless Vortex compression for float vectors. + storage child. On float vectors this typically finds ~15% lossless + savings via ALP-RD (mantissa/exponent split + bitpacking). - `vortex-turboquant` — The full `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` pipeline. - Lossy; recall@10 is reported alongside throughput. + Lossy; recall@10 is reported alongside throughput. At the default 8-bit + config this typically gives ~3× storage reduction at >90% top-10 + recall. ## Datasets -The first dataset wired up is **Cohere-100K** (`cohere-small`): 100K rows × -768 dims, cosine metric, ~150 MB zstd-parquet. This is the smallest -VectorDBBench-supplied embedding corpus and sits comfortably inside a CI -time / bandwidth budget. +The smallest built-in dataset is **Cohere-100K** (`cohere-small`): 100K +rows × 768 dims, cosine metric, ~150 MB zstd-parquet. It's the smallest +VectorDBBench-supplied corpus that still exercises every encoding path. +Larger variants (`cohere-medium`, `openai-small`, `openai-medium`, +`bioasq-medium`, `glove-medium`) are wired up for local / on-demand +experiments; see `vortex-bench/src/vector_dataset.rs` for the full list. -The upstream URL is -`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. The -public Zilliz bucket is anonymous-readable so the code _can_ hit it directly. +The upstream URL for Cohere-100K is +`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. 
+The public Zilliz bucket is anonymous-readable so the code can hit it +directly. ## Running locally ```bash cargo run -p vector-search-bench --release -- \ --datasets cohere-small \ - --formats parquet,vortex-uncompressed,vortex-default,vortex-turboquant \ + --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \ --iterations 5 \ -d table ``` @@ -62,6 +95,20 @@ The first run downloads the parquet file into `vortex-bench/data/cohere-small/cohere-small.parquet` and caches it idempotently for subsequent runs. +### Running without network access + +The `gen_synthetic_dataset` helper writes a VectorDBBench-shape parquet +file (`id: int64` + `emb: list`, zstd-compressed) at any path. +Use it to populate the dataset cache so the benchmark's idempotent +download step skips the HTTP fetch: + +```bash +cargo run -p vector-search-bench --bin gen_synthetic_dataset --release -- \ + --num-rows 5000 \ + --dim 768 \ + --out vortex-bench/data/cohere-small/cohere-small.parquet +``` + ## CI note: dataset mirror CI runs after every develop-branch merge. Hitting `assets.zilliz.com` diff --git a/benchmarks/vector-search-bench/src/parquet_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs similarity index 69% rename from benchmarks/vector-search-bench/src/parquet_baseline.rs rename to benchmarks/vector-search-bench/src/handrolled_baseline.rs index 222a125f362..85267c358e2 100644 --- a/benchmarks/vector-search-bench/src/parquet_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -1,27 +1,42 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Parquet-Arrow hand-rolled cosine similarity baseline. +//! Hand-rolled Rust cosine similarity baseline. //! -//! This module provides the "what you'd do without Vortex" floor for the vector-search -//! benchmark. It reads the canonical parquet file for a dataset via `parquet::arrow`, -//! 
decodes the `emb` column to an Arrow `FixedSizeListArray`, and then runs a -//! straightforward Rust cosine-similarity loop — no scalar functions, no lazy expressions, -//! no index. +//! This module provides the *compute-cost floor* the other Vortex variants are measured +//! against. It is **not** a realistic "parquet in a DBMS" baseline — it's the minimum +//! amount of work a Rust programmer could get away with if they wrote a vector-search +//! scan by hand with no query engine, no scalar-function dispatch, and no Arrow compute +//! kernels. //! -//! The four measurements produced mirror those of the Vortex variants so dashboards can -//! put the parquet bar right next to the vortex bars: +//! Two distinct phases run per iteration, and the benchmark times them separately so the +//! dashboard can separate storage-read cost from compute cost: //! -//! 1. Compressed size — the on-disk parquet file in bytes. -//! 2. Full-scan decode time — parquet → arrow record batches → concatenated -//! `FixedSizeListArray`. -//! 3. Cosine-similarity execute time — hand-rolled loop producing a `Vec` of scores. -//! 4. Filter execute time — the same loop materializing into a `Vec` where -//! `score > threshold`. +//! 1. **Decompress** ([`read_parquet_embedding_column`]) — reads the canonical parquet +//! file via `parquet-rs`, downcasts the `emb` column to an Arrow `Float32Array`, and +//! copies every value into a flat `Vec`. This phase is the only place Arrow is +//! actually used — only for the decode. The `memcpy` at the end is incidental: we +//! could operate directly on `Float32Array::values()` with identical performance, +//! but taking ownership of a `Vec` frees the Arrow `RecordBatch` lifetimes. +//! 2. **Compute** ([`cosine_loop`] and [`filter_loop`]) — runs a plain scalar Rust loop +//! over `&[f32]`. Arrow is no longer involved. There's no SIMD, no unrolling +//! annotations, no dispatch overhead, no output-array allocation beyond a single +//! `Vec`. 
This is deliberately "the fastest you could possibly make it go +//! without writing SIMD intrinsics". //! -//! This module does *not* include the parquet decode time in the cosine/filter wall -//! times. Decoding is treated as its own measurement. This matches how the Vortex variants -//! separate decode from compute. +//! Calling this "the parquet baseline" would be misleading, because: +//! +//! - The compute layer has nothing to do with parquet — parquet is only the input +//! encoding, not the execution substrate. +//! - Real parquet-on-DBMS engines (DuckDB's `list_cosine_similarity`, DataFusion with a +//! vector UDF, etc.) would pay substantial dispatch / planner / row-iterator cost +//! that this loop skips entirely. +//! +//! Think of it as: "If you didn't have Vortex and didn't feel like reaching for a query +//! engine, what's the minimum scan cost you could get away with on this data?" That's +//! the question this module answers, and it's intentionally a lower bound rather than a +//! fair DBMS comparison. Future work could add DuckDB / DataFusion baselines alongside +//! this one for the DBMS-level comparison. use std::fs::File; use std::path::Path; @@ -43,8 +58,15 @@ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use crate::VariantTimings; /// Read the entire `emb` column of a parquet file into a single flat `Vec`, along -/// with the dimension and row count. -pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { +/// with the dimension and row count. This is the *decompress* phase of the hand-rolled +/// baseline — it's the only place Arrow is actually used. `parquet-rs` does the file +/// decode, we downcast to `Float32Array`, and then memcpy into a plain `Vec` so +/// the compute loop can operate over a raw slice without holding any Arrow +/// `RecordBatch` references. 
+/// +/// Kept under its `parquet` name because this function *actually reads parquet*; only +/// the compute-side wrappers take the `handrolled` label. +pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { let file = File::open(parquet_path) .with_context(|| format!("open parquet file {}", parquet_path.display()))?; let file_size = file.metadata()?.len(); @@ -88,7 +110,7 @@ pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result, new_dim: usize) -> Result<()> } } -/// The flattened representation of a parquet file's embedding column, suitable for a -/// hand-rolled distance loop. -pub struct ParquetBaselineData { +/// The flattened representation of an embedding column, suitable for a hand-rolled +/// distance loop. Intentionally decoupled from any format — the compute side doesn't +/// care how the data got into this `Vec`. +pub struct HandrolledBaselineData { /// All rows concatenated: `elements.len() == num_rows * dim`. pub elements: Vec, /// Vector dimensionality. pub dim: usize, /// Number of rows. pub num_rows: usize, - /// On-disk size of the parquet file in bytes. + /// On-disk size of the parquet file in bytes. Reported as the "handrolled size" + /// measurement because parquet is the only format this benchmark serializes to + /// on disk today — a future v2 could report additional storage formats (raw f32 + /// blob, Arrow IPC, etc.) alongside it. pub file_size: u64, } -/// Run the decode / cosine / filter baseline microbenchmarks and return the best-of-N -/// wall times. Decoding is re-parquet-reading from disk on each iteration (matches how -/// the Vortex variants also re-execute from scratch each iteration). -pub fn run_parquet_baseline_timings( +/// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline +/// and return the best-of-N wall times. 
The decompress phase re-reads the parquet file +/// from disk on each iteration (matches how the Vortex variants re-execute their tree +/// from scratch each iteration), and the compute phase runs [`cosine_loop`] and +/// [`filter_loop`] over the flat `Vec` the decompress phase produced. +pub fn run_handrolled_baseline_timings( parquet_path: &Path, query: &[f32], threshold: f32, @@ -283,7 +311,7 @@ mod tests { } #[test] - fn parquet_baseline_reads_fsl_column() { + fn handrolled_baseline_reads_fsl_column() { let file = write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) .unwrap(); diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index fe27400dce6..9f61f62484f 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -32,7 +32,7 @@ use std::time::Duration; use std::time::Instant; -pub mod parquet_baseline; +pub mod handrolled_baseline; pub mod recall; pub mod verify; diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 1e8b0643bc7..4d675030f4f 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -9,10 +9,15 @@ //! ```bash //! cargo run -p vector-search-bench --release -- \ //! --datasets cohere-small \ -//! --variants vortex-uncompressed,vortex-default,vortex-turboquant \ +//! --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \ //! --iterations 5 \ //! -d table //! ``` +//! +//! The `handrolled` variant is a hand-rolled Rust scalar cosine loop over a flat +//! `Vec` decoded from the dataset's canonical parquet file; it is a compute-cost +//! floor, not a realistic parquet-on-DBMS baseline. See +//! [`handrolled_baseline`](vector_search_bench::handrolled_baseline) for details. 
use std::borrow::Cow; use std::path::PathBuf; @@ -23,7 +28,7 @@ use clap::Parser; use indicatif::ProgressBar; use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; -use vector_search_bench::parquet_baseline::run_parquet_baseline_timings; +use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; use vector_search_bench::recall::DEFAULT_TOP_K; @@ -61,9 +66,9 @@ struct Args { /// Which benchmark variants to run, using kebab-cased labels. The `--formats` name is /// used (instead of `--variants`) so this benchmark matches the CI invocation /// convention shared across random-access-bench / compress-bench. Accepted values: - /// `parquet`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. Defaults - /// to running all four. - #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Parquet, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] + /// `handrolled`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. + /// Defaults to running all four. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Handrolled, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] formats: Vec, /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables @@ -123,9 +128,12 @@ impl SelectableDataset { #[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] enum SelectableFormat { - /// Parquet-Arrow hand-rolled cosine loop baseline. - #[clap(name = "parquet")] - Parquet, + /// Hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from the + /// canonical parquet file via `parquet-rs` / `arrow-rs`. Compute-cost floor — + /// not a realistic parquet-on-DBMS baseline. See + /// [`vector_search_bench::handrolled_baseline`]. 
+ #[clap(name = "handrolled")] + Handrolled, /// Raw `Vector` with no encoding compression. #[clap(name = "vortex-uncompressed")] VortexUncompressed, @@ -140,7 +148,7 @@ enum SelectableFormat { impl SelectableFormat { fn into_variant(self) -> Option { match self { - SelectableFormat::Parquet => None, + SelectableFormat::Handrolled => None, SelectableFormat::VortexUncompressed => Some(Variant::VortexUncompressed), SelectableFormat::VortexDefault => Some(Variant::VortexDefault), SelectableFormat::VortexTurboQuant => Some(Variant::VortexTurboQuant), @@ -160,7 +168,7 @@ async fn main() -> Result<()> { .map(SelectableDataset::into_dataset) .collect(); - let run_parquet_baseline = args.formats.contains(&SelectableFormat::Parquet); + let run_handrolled_baseline = args.formats.contains(&SelectableFormat::Handrolled); let variants: Vec = args .formats .iter() @@ -185,8 +193,8 @@ async fn main() -> Result<()> { ); // Ground-truth cosine scores for the verification query — the scores produced by - // the uncompressed Vortex scan. Every other variant (including the parquet - // hand-rolled loop) will be compared against this. + // the uncompressed Vortex scan. Every other variant (including the hand-rolled + // baseline) will be compared against this. let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION) .context("compute ground-truth cosine scores for verification")?; @@ -196,58 +204,66 @@ async fn main() -> Result<()> { prepared.name ); - // Parquet-Arrow baseline. Emitted as a separate pseudo-variant with label - // `parquet` / Format::Parquet so it shows up in dashboards next to the Vortex - // variants. The parquet baseline uses a hand-rolled Rust cosine loop; it must - // match the Vortex cosine scores within lossless tolerance (f32 ULPs) because - // it's computing the same math on the same underlying f32 values. - if run_parquet_baseline { + // Hand-rolled baseline. 
Emitted as a separate pseudo-variant with label + // `handrolled` so it shows up in dashboards next to the Vortex variants. This + // is a hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from + // parquet via `parquet-rs`; it must match the Vortex cosine scores within the + // lossless tolerance (f32 ULPs) because it's computing the same math on the + // same underlying f32 values. + // + // `target.format` stays `Format::Parquet` because the *storage* side is still + // parquet on disk — only the *compute* is hand-rolled. The metric `name` field + // carries the `handrolled` label so human readers can tell the compute apart + // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. + if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; let baseline_data = - vector_search_bench::parquet_baseline::read_parquet_embedding_column(&parquet_path) - .context("read parquet emb column for verification")?; - let parquet_scores = vector_search_bench::parquet_baseline::cosine_loop( + vector_search_bench::handrolled_baseline::read_parquet_embedding_column( + &parquet_path, + ) + .context("read parquet emb column for verification")?; + let handrolled_scores = vector_search_bench::handrolled_baseline::cosine_loop( &baseline_data.elements, baseline_data.num_rows, baseline_data.dim, &prepared.query, ); - let parquet_report = vector_search_bench::verify::verify_scores( + let handrolled_report = vector_search_bench::verify::verify_scores( &baseline_scores, - &parquet_scores, + &handrolled_scores, VerificationKind::Lossless, ); - if !parquet_report.passed { + if !handrolled_report.passed { anyhow::bail!( - "parquet baseline correctness check failed on {}: \ + "handrolled baseline correctness check failed on {}: \ max_abs_diff={:.6}, mean_abs_diff={:.6}, tolerance={:.6}", prepared.name, - parquet_report.max_abs_diff, - parquet_report.mean_abs_diff, - parquet_report.tolerance(), + handrolled_report.max_abs_diff, + 
handrolled_report.mean_abs_diff, + handrolled_report.tolerance(), ); } tracing::info!( - "parquet/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + "handrolled/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", prepared.name, - parquet_report.max_abs_diff, - parquet_report.mean_abs_diff, + handrolled_report.max_abs_diff, + handrolled_report.mean_abs_diff, ); verification.push(CustomUnitMeasurement { - name: format!("correctness-max-diff/parquet/{}", prepared.name), + name: format!("correctness-max-diff/handrolled/{}", prepared.name), format: Format::Parquet, unit: Cow::from("abs-diff"), - value: parquet_report.max_abs_diff, + value: handrolled_report.max_abs_diff, }); - let baseline_timings = run_parquet_baseline_timings( + let baseline_timings = run_handrolled_baseline_timings( &parquet_path, &prepared.query, DEFAULT_THRESHOLD, args.iterations, )?; - let label = "parquet"; + let label = "handrolled"; let bench_name = format!("{label}/{}", prepared.name); sizes.push(CustomUnitMeasurement { From d012eb1210b78090f5d6c9dde08ab56719bc72d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:02:16 +0000 Subject: [PATCH 11/18] vector-search-bench: fix correctness bugs found in review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review caught four real bugs plus a stylistic stray import. All are small, targeted fixes. 1. `build_similarity_search_tree` (vortex-tensor) and `execute_cosine` (vector-search-bench) wrapped `CosineSimilarity::try_new_array` in `vortex_expect`, which panics on the `Err` branch. Both functions' doc comments claimed they *returned* errors on dimension mismatch, so the behavior contradicted the contract: a caller passing a wrong-dimension query vector would crash instead of seeing a clean error. Fixed by replacing `.vortex_expect(...)` with `?`. 2. 
The progress bar in `main.rs` was initialized with `datasets.len() * args.formats.len()` (which counts the handrolled baseline) but only `inc(1)`'d inside the Vortex-variants loop, so the bar finished at `N-1/N` whenever handrolled was selected. Fixed by incrementing inside the handrolled block and recomputing `total_work` from the actual increment sites. 3. `extract_query_row` blindly called `PrimitiveArray::as_slice::<f32>()` without checking the underlying ptype. The benchmark only ever uses f32 today, but the `Vector` extension type supports f16/f32/f64, so a future caller that accidentally passes an f64 `Vector` would get a mis-typed slice. Fixed by asserting `elements.ptype() == PType::F32` before the cast with a clear error message. While there, simplified the convoluted bounds check (was `row * dim + dim > len * dim`, reduces to `row >= len` without the multiplication). 4. `gen_synthetic_dataset.rs` and the README had shell examples using `--bin gen-synthetic-dataset` with hyphens, but cargo's default bin name is the filename minus extension — i.e., underscores. The example would fail to resolve. Fixed to `gen_synthetic_dataset`. 5. `use std::io::Write;` was stranded at the bottom of `main.rs` as a single-line import after the `main` function. Moved to the top import block for idiomatic style. All 13 vector-search-bench tests still pass. 222 vortex-tensor tests still pass. clippy and rustfmt clean. Public API unchanged — no public-api.lock regen needed. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/README.md | 5 ++- .../src/bin/gen_synthetic_dataset.rs | 5 ++- benchmarks/vector-search-bench/src/lib.rs | 44 ++++++++++++------- benchmarks/vector-search-bench/src/main.rs | 10 +++-- vortex-tensor/src/vector_search.rs | 5 +-- 5 files changed, 43 insertions(+), 26 deletions(-) diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md index 49f117e5c26..50af601ec2a 100644 --- a/benchmarks/vector-search-bench/README.md +++ b/benchmarks/vector-search-bench/README.md @@ -103,12 +103,15 @@ Use it to populate the dataset cache so the benchmark's idempotent download step skips the HTTP fetch: ```bash -cargo run -p vector-search-bench --bin gen-synthetic-dataset --release -- \ +cargo run -p vector-search-bench --release --bin gen_synthetic_dataset -- \ --num-rows 5000 \ --dim 768 \ --out vortex-bench/data/cohere-small/cohere-small.parquet ``` +(Cargo's default bin name is the filename minus extension, so underscores, +not hyphens.) + ## CI note: dataset mirror CI runs after every develop-branch merge. Hitting `assets.zilliz.com` diff --git a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs index 3f5500edfcf..16e96cce5b4 100644 --- a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs +++ b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs @@ -12,14 +12,15 @@ //! Example: //! //! ```bash -//! cargo run -p vector-search-bench --bin gen-synthetic-dataset --release -- \ +//! cargo run -p vector-search-bench --bin gen_synthetic_dataset --release -- \ //! --num-rows 5000 \ //! --dim 768 \ //! --out vortex-bench/data/cohere-small/cohere-small.parquet //! ``` //! //! After running this, `vector-search-bench --datasets cohere-small` will find the -//! cached parquet file and skip the HTTP download via `idempotent_async`. +//! 
cached parquet file and skip the HTTP download via `idempotent_async`. (Cargo's +//! default bin name is the filename minus extension — underscores, not hyphens.) use std::fs::File; use std::path::PathBuf; diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 9f61f62484f..bfa0aa55c8d 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -46,7 +46,6 @@ use vortex::array::VortexSessionExecute; use vortex::array::arrays::BoolArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::PrimitiveArray; -use vortex::error::VortexExpect; use vortex::session::VortexSession; use vortex_bench::Format; use vortex_bench::SESSION; @@ -237,35 +236,44 @@ fn extract_emb_column(struct_array: &ArrayRef) -> Result { } /// Pull a single row out of a `Vector` extension array as a plain `Vec`. -fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { +/// +/// Only `f32`-typed `Vector` arrays are supported today — the benchmark deliberately +/// restricts itself to `f32` vectors, so we assert the element type rather than +/// quietly returning a mis-cast slice. +pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { use vortex::array::arrays::Extension; use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; + use vortex::dtype::PType; - let mut ctx = SESSION.create_execution_ctx(); + if row >= vector_ext.len() { + bail!( + "query row {row} out of bounds for dataset of length {}", + vector_ext.len() + ); + } let ext_view = vector_ext .as_opt::() .context("prepared dataset must be a Vector extension array")?; + let mut ctx = SESSION.create_execution_ctx(); + // Execute storage array to its canonical FSL form. 
let fsl: FixedSizeListArray = ext_view.storage_array().clone().execute(&mut ctx)?; - let dim_usize = { - let vortex::dtype::DType::FixedSizeList(_, d, _) = fsl.dtype() else { - bail!("storage dtype must be FixedSizeList"); - }; - *d as usize + let dim_usize = match fsl.dtype() { + vortex::dtype::DType::FixedSizeList(_, d, _) => *d as usize, + other => bail!("storage dtype must be FixedSizeList, got {other}"), }; - if row * dim_usize + dim_usize > vector_ext.len() * dim_usize { + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; + if elements.ptype() != PType::F32 { bail!( - "query row {row} out of bounds for dataset of length {}", - vector_ext.len() + "extract_query_row currently only supports f32 Vector columns, got {:?}", + elements.ptype() ); } - - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; let slice = elements.as_slice::(); let start = row * dim_usize; Ok(slice[start..start + dim_usize].to_vec()) @@ -440,6 +448,12 @@ pub fn decompress_full_scan( /// Execute `CosineSimilarity(data, broadcast(query))` to a materialized `f32` /// [`PrimitiveArray`]. Shared between the timing loop and the correctness-verification /// path so both exercise the exact same expression tree. +/// +/// # Errors +/// +/// Returns an error if `data` is not a [`vortex_tensor::vector::Vector`] extension array, +/// if `query`'s length doesn't match the database vector dimension, or if the execution +/// context rejects the expression. pub fn execute_cosine( data: &ArrayRef, query: &[f32], @@ -450,9 +464,7 @@ pub fn execute_cosine( let num_rows = data.len(); let query_vec = build_constant_query_vector(query, num_rows)?; - let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows) - .vortex_expect("cosine similarity accepts matching Vector inputs") - .into_array(); + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows)?.into_array(); Ok(cosine.execute(ctx)?) 
} diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 4d675030f4f..e6ad3ad8357 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -20,6 +20,7 @@ //! [`handrolled_baseline`](vector_search_bench::handrolled_baseline) for details. use std::borrow::Cow; +use std::io::Write; use std::path::PathBuf; use anyhow::Context; @@ -175,7 +176,10 @@ async fn main() -> Result<()> { .filter_map(|f| f.into_variant()) .collect(); - let total_work = datasets.len() * args.formats.len(); + // One `progress.inc(1)` happens per Vortex variant, plus one per dataset for the + // handrolled path when it's enabled, so recompute `total_work` from those actual + // increment sites rather than from `args.formats.len()`, which over-counted. + let total_work = datasets.len() * (variants.len() + usize::from(run_handrolled_baseline)); let progress = ProgressBar::new(total_work as u64); let mut timings: Vec = Vec::new(); @@ -287,6 +291,8 @@ async fn main() -> Result<()> { format: Format::Parquet, time: baseline_timings.filter, }); + + progress.inc(1); } for &variant in &variants { @@ -422,5 +428,3 @@ async fn main() -> Result<()> { Ok(()) } - -use std::io::Write; diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs index 37adbf27a61..6934fa52a71 100644 --- a/vortex-tensor/src/vector_search.rs +++ b/vortex-tensor/src/vector_search.rs @@ -57,7 +57,6 @@ use vortex_array::dtype::extension::ExtDType; use vortex_array::extension::EmptyMetadata; use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::fns::operators::Operator; -use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -159,9 +158,7 @@ pub fn build_similarity_search_tree( let num_rows = data.len(); let query_vec = build_constant_query_vector(query, num_rows)?; - let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows) - 
.vortex_expect("cosine similarity accepts two matching Vector extension arrays") - .into_array(); + let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); From 00e88ea58324f88c9e82d428abc8252e6d6ca630 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:04:59 +0000 Subject: [PATCH 12/18] vector-search-bench: remove dead code from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup of seven dead-or-suppressed items the review flagged: 1. `QueryLen` type alias (`lib.rs`) — defined but only "used" via `let _ = QueryLen::default;`, a warning-suppression hack. The doc comment claimed it made "broadcast semantics obvious at call sites" but no call site actually referenced it. Both the alias and the suppression `let` are gone. 2. `async-trait` and `itertools` Cargo dependencies — listed in `vector-search-bench/Cargo.toml` but not referenced anywhere in the crate's source tree (probably copied from `compress-bench` when the crate was bootstrapped). Removed. 3. `VERIFICATION_QUERY_ROW` constant (`verify.rs`) — defined as `0` and documented as "the row we verify against" but never referenced from anywhere; `main.rs` passes `prepared.query` directly, and `prepared.query` is extracted at `DEFAULT_QUERY_ROW` time inside `prepare_dataset`. Removed. 4. Dead-code-suppression hack in `vortex_bench::conversions::tests`: // ...suppress the dead-code warning by referencing them in a // no-op expression. let _ = (Nullability::NonNullable, PType::I32); Both imports were never used in the test body. Removed the imports and the dead let. 5. `let _ = emb_idx;` in `handrolled_baseline::read_parquet_embedding_column`. Replaced with destructuring via `let (_, emb_field) = ...`. 6. 
`HandrolledBaselineData::file_size` — populated by `read_parquet_ embedding_column` via a second `File::metadata()` call, but no caller reads it: the benchmark's "handrolled size" measurement reads `PreparedDataset::parquet_bytes` (populated once in `prepare_dataset`). Keeping the duplicated state was a recipe for drift. Removed the field and its populating code, and added a doc comment on the struct pointing at `parquet_bytes` as the canonical file-size source. All 13 vector-search-bench tests + 16 vortex-bench tests still pass. clippy / rustfmt clean. Signed-off-by: Claude Signed-off-by: Connor Tsui --- Cargo.lock | 2 -- benchmarks/vector-search-bench/Cargo.toml | 2 -- .../src/handrolled_baseline.rs | 16 +++++++--------- benchmarks/vector-search-bench/src/lib.rs | 7 ------- benchmarks/vector-search-bench/src/verify.rs | 5 ----- vortex-bench/src/conversions.rs | 6 ------ 6 files changed, 7 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ebc8530825a..aa4519e13ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10060,10 +10060,8 @@ dependencies = [ "arrow-array 58.0.0", "arrow-buffer 58.0.0", "arrow-schema 58.0.0", - "async-trait", "clap", "indicatif", - "itertools 0.14.0", "parquet 58.0.0", "tempfile", "tokio", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index 8e8101ddcee..bbc620572b5 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -19,10 +19,8 @@ anyhow = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } -async-trait = { workspace = true } clap = { workspace = true, features = ["derive"] } indicatif = { workspace = true } -itertools = { workspace = true } parquet = { workspace = true } tokio = { workspace = true, features = ["full"] } tracing = { workspace = true } diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs 
b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index 85267c358e2..260d4726857 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -69,11 +69,10 @@ use crate::VariantTimings; pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { let file = File::open(parquet_path) .with_context(|| format!("open parquet file {}", parquet_path.display()))?; - let file_size = file.metadata()?.len(); let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; // Locate the `emb` column and sanity-check its type. - let (emb_idx, emb_field) = builder + let (_, emb_field) = builder .schema() .column_with_name("emb") .context("parquet schema missing `emb` column")?; @@ -93,7 +92,6 @@ pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result = reader.collect::, _>>()?; @@ -114,7 +112,6 @@ pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result, new_dim: usize) -> Result<()> /// The flattened representation of an embedding column, suitable for a hand-rolled /// distance loop. Intentionally decoupled from any format — the compute side doesn't /// care how the data got into this `Vec`. +/// +/// The benchmark's "size" measurement for the handrolled baseline comes from +/// [`crate::PreparedDataset::parquet_bytes`] (which is populated once in +/// [`crate::prepare_dataset`]), not from this struct. We deliberately don't carry +/// the file size in here — doing so would duplicate state between two places that +/// can go out of sync. pub struct HandrolledBaselineData { /// All rows concatenated: `elements.len() == num_rows * dim`. pub elements: Vec, @@ -182,11 +185,6 @@ pub struct HandrolledBaselineData { pub dim: usize, /// Number of rows. pub num_rows: usize, - /// On-disk size of the parquet file in bytes. 
Reported as the "handrolled size" - /// measurement because parquet is the only format this benchmark serializes to - /// on disk today — a future v2 could report additional storage formats (raw f32 - /// blob, Arrow IPC, etc.) alongside it. - pub file_size: u64, } /// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index bfa0aa55c8d..7be88e7b734 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -112,11 +112,6 @@ impl Variant { } } -/// Number of rows in the query vector — matches the database so `ScalarFnArray`'s -/// equal-length contract is satisfied. This type alias exists to make the broadcast -/// semantics obvious at call sites. -type QueryLen = usize; - /// A materialized Vortex array and its associated execution session / context. pub struct PreparedDataset { /// Name used in metric strings — usually the dataset's `Dataset::name()`. @@ -361,8 +356,6 @@ pub fn run_timings( iterations: usize, session: &VortexSession, ) -> Result { - let _ = QueryLen::default; // touch the type alias so rustc doesn't warn - let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index 7dd6a564f36..edac6d1fc27 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -41,11 +41,6 @@ pub const LOSSLESS_TOLERANCE: f32 = 1e-4; /// that catches regressions without flaking on distribution-specific noise. pub const LOSSY_TOLERANCE: f32 = 0.2; -/// Row index used to pick the verification query. Row 0 is also what -/// [`crate::DEFAULT_QUERY_ROW`] selects, so the verification and timing paths exercise -/// identical inputs. 
-pub const VERIFICATION_QUERY_ROW: usize = 0; - /// How lossy a variant is allowed to be when its scores are compared to the /// uncompressed baseline. #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index e31c3520d2f..d640f583ee9 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -372,8 +372,6 @@ mod tests { use vortex::array::validity::Validity; use vortex::buffer::BufferMut; use vortex::dtype::DType; - use vortex::dtype::Nullability; - use vortex::dtype::PType; use super::list_to_vector_ext; @@ -463,9 +461,5 @@ mod tests { err.contains("element type must be float"), "unexpected error: {err}", ); - - // The unused Nullability / PType imports exist to make the intent clear; suppress - // the dead-code warning by referencing them in a no-op expression. - let _ = (Nullability::NonNullable, PType::I32); } } From 3f026220bb9b5da50a5cbec0a08b7a02fd0b01ad Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:08:18 +0000 Subject: [PATCH 13/18] vector-search-bench: dedupe helpers and consolidate verify paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four related cleanups the review flagged for modularity & clarity: 1. Deleted the duplicate `extract_query_row` helper from `recall.rs` (30 lines near-identical to the one in `lib.rs`). Both call sites now go through a single `pub(crate)` helper in `lib.rs`. The lib.rs helper also gets a fresh doc comment explicitly calling out its f32-only requirement (landed in the previous commit) so the deduped version is not just shorter but more correct. 2. Added `verify::verify_and_report_scores`: a score-vector-first variant of `verify_variant` that takes pre-computed scores as `&[f32]` instead of an `ArrayRef`. `verify_variant` now delegates to it. 
The point is to make both main-loop paths share a single error-handling / logging / bail-on-lossless codepath: - Vortex variants compute scores via `compute_cosine_scores(array)` and call `verify_variant`, which then calls `verify_and_report_scores` under the hood. - The handrolled baseline computes scores via `cosine_loop(&[f32])` and calls `verify_and_report_scores` directly. Previously `main.rs` hand-rolled the handrolled path's check/log/ bail triplet, duplicating logic in two places that could drift. 3. Dropped the `compress_turboquant` and `build_similarity_search_tree` wrapper shims from `vortex-tensor/benches/similarity_search_common/ mod.rs`. They existed only to re-expose the public helpers under local names for bench call-site compatibility. The bench now `pub use`s them directly, which is one fewer indirection layer. `compress_default` stays in the bench module because its `BtrBlocksCompressor` dependency lives in vortex-tensor's dev-dependencies, not main deps, so lifting it to the public `vector_search` module would pull `vortex-btrblocks` into vortex-tensor's main dep graph. 4. Replaced the stale "in future commits" doc comment on `PreparedDataset::uncompressed` — recall is already wired. All 13 vector-search-bench + 222 vortex-tensor tests pass. clippy / rustfmt clean. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 8 ++-- benchmarks/vector-search-bench/src/main.rs | 43 ++++++++----------- benchmarks/vector-search-bench/src/recall.rs | 33 +------------- benchmarks/vector-search-bench/src/verify.rs | 41 +++++++++++++----- .../benches/similarity_search_common/mod.rs | 27 +++--------- 5 files changed, 62 insertions(+), 90 deletions(-) diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 7be88e7b734..37cc3cb0ef2 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -116,12 +116,14 @@ impl Variant { pub struct PreparedDataset { /// Name used in metric strings — usually the dataset's `Dataset::name()`. pub name: String, - /// Uncompressed `Vector` array (canonical form). This is reused as the - /// ground-truth basis for TurboQuant recall checks in future commits. + /// Uncompressed `Vector` array (canonical form). Doubles as the + /// ground-truth basis for the correctness-verification pass and for TurboQuant's + /// Recall@K quality measurement. pub uncompressed: ArrayRef, /// The query vector to use (a single row pulled from the dataset). pub query: Vec, - /// Parquet file size on disk in bytes — produced by the dataset download step. + /// Parquet file size on disk in bytes — produced by the dataset download step + /// and reused as the "handrolled size" measurement in main.rs. 
pub parquet_bytes: u64, } diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index e6ad3ad8357..7cae73cf823 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -29,6 +29,8 @@ use clap::Parser; use indicatif::ProgressBar; use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; +use vector_search_bench::handrolled_baseline::cosine_loop; +use vector_search_bench::handrolled_baseline::read_parquet_embedding_column; use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; @@ -37,6 +39,7 @@ use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; use vector_search_bench::verify::VerificationKind; use vector_search_bench::verify::compute_cosine_scores; +use vector_search_bench::verify::verify_and_report_scores; use vector_search_bench::verify::verify_variant; use vortex_bench::Format; use vortex_bench::SESSION; @@ -221,40 +224,35 @@ async fn main() -> Result<()> { // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; - let baseline_data = - vector_search_bench::handrolled_baseline::read_parquet_embedding_column( - &parquet_path, - ) + let label = "handrolled"; + let bench_name = format!("{label}/{}", prepared.name); + + // Verify the handrolled cosine scores against the Vortex baseline before + // any timing starts. `verify_and_report_scores` is the same helper the + // Vortex-variant loop ends up calling through `verify_variant`, so the + // two paths share all their pass/fail / log / bail logic. 
+ let baseline_data = read_parquet_embedding_column(&parquet_path) .context("read parquet emb column for verification")?; - let handrolled_scores = vector_search_bench::handrolled_baseline::cosine_loop( + let handrolled_scores = cosine_loop( &baseline_data.elements, baseline_data.num_rows, baseline_data.dim, &prepared.query, ); - let handrolled_report = vector_search_bench::verify::verify_scores( - &baseline_scores, + let handrolled_report = verify_and_report_scores( + &bench_name, &handrolled_scores, + &baseline_scores, VerificationKind::Lossless, - ); - if !handrolled_report.passed { - anyhow::bail!( - "handrolled baseline correctness check failed on {}: \ - max_abs_diff={:.6}, mean_abs_diff={:.6}, tolerance={:.6}", - prepared.name, - handrolled_report.max_abs_diff, - handrolled_report.mean_abs_diff, - handrolled_report.tolerance(), - ); - } + )?; tracing::info!( - "handrolled/{} verification: max_abs_diff={:.2e}, mean_abs_diff={:.2e}", - prepared.name, + "{} verification (Lossless): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, handrolled_report.max_abs_diff, handrolled_report.mean_abs_diff, ); verification.push(CustomUnitMeasurement { - name: format!("correctness-max-diff/handrolled/{}", prepared.name), + name: format!("correctness-max-diff/{bench_name}"), format: Format::Parquet, unit: Cow::from("abs-diff"), value: handrolled_report.max_abs_diff, @@ -267,9 +265,6 @@ async fn main() -> Result<()> { args.iterations, )?; - let label = "handrolled"; - let bench_name = format!("{label}/{}", prepared.name); - sizes.push(CustomUnitMeasurement { name: format!("{label} size/{}", prepared.name), format: Format::Parquet, diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index 1c587616d4e..c091b74305e 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -16,12 +16,10 @@ use anyhow::Result; use vortex::array::ArrayRef; -use 
vortex::array::VortexSessionExecute; -use vortex::array::arrays::PrimitiveArray; -use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::session::VortexSession; use vortex::utils::aliases::hash_set::HashSet; +use crate::extract_query_row; use crate::verify::compute_cosine_scores; /// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. @@ -60,7 +58,7 @@ pub fn measure_recall_at_k( for q in 0..num_queries { let row = (q * step).min(num_rows - 1); - let query = extract_query_row(uncompressed, row, session)?; + let query = extract_query_row(uncompressed, row)?; let gt_scores = compute_cosine_scores(uncompressed, &query, session)?; let truth = top_k_indices(>_scores, top_k); @@ -76,33 +74,6 @@ pub fn measure_recall_at_k( Ok(total_hits as f64 / total_checked as f64) } -fn extract_query_row( - vector_ext: &ArrayRef, - row: usize, - session: &VortexSession, -) -> Result> { - use anyhow::Context; - use vortex::array::arrays::Extension; - use vortex::array::arrays::FixedSizeListArray; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - - let mut ctx = session.create_execution_ctx(); - let ext = vector_ext - .as_opt::() - .context("extract_query_row expects an Extension array")?; - let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx)?; - - let dim_usize = match fsl.dtype() { - vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim as usize, - other => anyhow::bail!("expected FixedSizeList storage, got {other}"), - }; - - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; - let slice = elements.as_slice::(); - let start = row * dim_usize; - Ok(slice[start..start + dim_usize].to_vec()) -} - /// Return the indices of the top-K highest scores, stable-sorted descending. 
fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { let mut idx: Vec = (0..scores.len()).collect(); diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index edac6d1fc27..cbca8a3adc5 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -146,21 +146,23 @@ pub fn verify_scores( } } -/// End-to-end variant verification: executes cosine on `variant_array` against the -/// same query used for the baseline and returns a [`VerificationReport`]. Returns -/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond -/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality -/// tradeoff. -pub fn verify_variant( +/// Verify pre-computed scores against a baseline and enforce the tolerance band. +/// +/// Takes already-materialized `variant_scores` (as a `&[f32]`) rather than an +/// `ArrayRef`, so both the Vortex-variant path (which computes scores via +/// [`execute_cosine`](crate::execute_cosine)) and the hand-rolled baseline path (which +/// runs a plain Rust loop over a flat `Vec`) share the same error-handling, +/// logging, and hard-fail logic without duplicating it in `main.rs`. +/// +/// Lossless mismatches bail the run with an error; lossy mismatches log a warning +/// but let the run continue so the recall measurement is still reported. 
+pub fn verify_and_report_scores( variant_name: &str, - variant_array: &ArrayRef, - query: &[f32], + variant_scores: &[f32], baseline_scores: &[f32], kind: VerificationKind, - session: &VortexSession, ) -> Result { - let scores = compute_cosine_scores(variant_array, query, session)?; - let report = verify_scores(baseline_scores, &scores, kind); + let report = verify_scores(baseline_scores, variant_scores, kind); if !report.passed { let message = format!( @@ -182,6 +184,23 @@ pub fn verify_variant( Ok(report) } +/// End-to-end variant verification: executes cosine on `variant_array` against the +/// same query used for the baseline and returns a [`VerificationReport`]. Returns +/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond +/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality +/// tradeoff. +pub fn verify_variant( + variant_name: &str, + variant_array: &ArrayRef, + query: &[f32], + baseline_scores: &[f32], + kind: VerificationKind, + session: &VortexSession, +) -> Result { + let scores = compute_cosine_scores(variant_array, query, session)?; + verify_and_report_scores(variant_name, &scores, baseline_scores, kind) +} + #[cfg(test)] mod tests { use vortex_bench::SESSION; diff --git a/vortex-tensor/benches/similarity_search_common/mod.rs b/vortex-tensor/benches/similarity_search_common/mod.rs index ee0bab128ed..b95867b26b7 100644 --- a/vortex-tensor/benches/similarity_search_common/mod.rs +++ b/vortex-tensor/benches/similarity_search_common/mod.rs @@ -47,8 +47,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_panic; use vortex_session::VortexSession; use vortex_tensor::vector::Vector; -use vortex_tensor::vector_search::build_similarity_search_tree as public_build_similarity_search_tree; -use vortex_tensor::vector_search::compress_turboquant as public_compress_turboquant; +pub use vortex_tensor::vector_search::build_similarity_search_tree; +pub use 
vortex_tensor::vector_search::compress_turboquant; /// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and /// example can create execution contexts cheaply. @@ -141,17 +141,14 @@ pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec VortexResult { BtrBlocksCompressor::default().compress(&data) } -/// Compresses a raw `Vector` array with the TurboQuant pipeline. This is a thin -/// wrapper around [`vortex_tensor::vector_search::compress_turboquant`] preserved for bench -/// call-site compatibility. -pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - public_compress_turboquant(data, ctx) -} - /// Dispatch helper that builds the data array for the requested [`Variant`], starting from a /// single random-vector generation. Always returns an `ArrayRef` whose logical dtype is /// `Vector`. @@ -169,15 +166,3 @@ pub fn build_variant( Variant::TurboQuant => compress_turboquant(raw, ctx), } } - -/// Build the lazy similarity-search array tree for a prepared data array and a single query -/// vector. Thin wrapper around -/// [`vortex_tensor::vector_search::build_similarity_search_tree`] preserved for bench -/// call-site compatibility. -pub fn build_similarity_search_tree( - data: ArrayRef, - query: &[f32], - threshold: f32, -) -> VortexResult { - public_build_similarity_search_tree(data, query, threshold) -} From cbcb078d608c709cfcac80883d90f2cd1b99bf94 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:09:40 +0000 Subject: [PATCH 14/18] vector-search-bench: optimize offset validation, interleave timings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two micro-refactors for correctness quality: 1. `list_to_vector_ext` offset validation walks `ListArrayExt::offset_at` N+1 times (once per row boundary) instead of 2N times (once per start and once per end of every row). 
Each iteration now carries the previous `end` into the next iteration's `prev_end`, so we only fetch each offset boundary once. For a canonical list produced by `parquet_to_vortex_chunks` the offsets are a `Primitive` array and `offset_at` takes the fast slice-index path, so this halves the offset lookups from ~200K to ~100K on a 100K-row column. The loop body is still O(1) per row either way. 2. `run_timings` now interleaves its decompress / cosine / filter stages inside a single outer loop over iterations, instead of running the three stages as back-to-back sequential loops. The back-to-back shape gives each stage an asymmetric cache profile — the second+ iterations of the cosine stage run on cache lines left behind by the previous *cosine* iteration, while the second+ iterations of the filter stage run on cache lines left behind by the *cosine* stage. Interleaving makes each iteration of each stage see roughly the same cache state, removing the asymmetry. Each stage still gets a fresh `ExecutionCtx` so no cached scalar-fn state leaks between stages within a single iteration. The doc comment on `run_timings` now spells out why interleaving matters. All 13 vector-search-bench + 16 vortex-bench tests still pass. Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 55 ++++++++++++++--------- vortex-bench/src/conversions.rs | 15 +++++-- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 37cc3cb0ef2..d5d5df1f1a2 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -352,6 +352,18 @@ pub fn prepare_variant( /// Run the decompress / cosine / filter microbenchmarks against a prepared variant /// array and return the best-of-`iterations` wall times for each measurement. 
+/// +/// The three stages are **interleaved** inside a single outer loop rather than run +/// as three separate back-to-back loops. Interleaving keeps each stage's cache / +/// branch-predictor / allocator state symmetric across iterations — a pathology of +/// the back-to-back shape is that iteration `N+1` of the cosine stage runs on +/// warmed caches left behind by iteration `N` of the cosine stage, while iteration +/// `N+1` of the filter stage runs on caches left behind by the *cosine* stage. The +/// interleaved form makes each stage see roughly the same cache state every +/// iteration. +/// +/// Each stage still gets a fresh `ExecutionCtx`, so no cached scalar-fn state leaks +/// between stages within a single iteration. pub fn run_timings( variant_array: &ArrayRef, query: &[f32], @@ -363,27 +375,28 @@ pub fn run_timings( let mut filter = Duration::MAX; for _ in 0..iterations { - let mut ctx = session.create_execution_ctx(); - let start = Instant::now(); - let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; - decompress = decompress.min(start.elapsed()); - drop(decoded); - } - - for _ in 0..iterations { - let mut ctx = session.create_execution_ctx(); - let start = Instant::now(); - let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; - cosine = cosine.min(start.elapsed()); - drop(scores); - } - - for _ in 0..iterations { - let mut ctx = session.create_execution_ctx(); - let start = Instant::now(); - let matches: BoolArray = execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; - filter = filter.min(start.elapsed()); - drop(matches); + { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; + decompress = decompress.min(start.elapsed()); + drop(decoded); + } + { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let scores: PrimitiveArray = 
execute_cosine(variant_array, query, &mut ctx)?; + cosine = cosine.min(start.elapsed()); + drop(scores); + } + { + let mut ctx = session.create_execution_ctx(); + let start = Instant::now(); + let matches: BoolArray = + execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; + filter = filter.min(start.elapsed()); + drop(matches); + } } Ok(VariantTimings { diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index d640f583ee9..1a486034919 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -317,19 +317,25 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { vortex_bail!("list_to_vector_ext: cannot infer vector dimension from empty input"); } - let first_start = list.offset_at(0)?; + // Walk the offsets array once, reusing the previous iteration's `end` as the + // next iteration's `start`. Each `offset_at` call goes through + // `ListArrayExt::offset_at`, which has a fast path when the offsets child is a + // `Primitive` array (direct slice index). That's the common case after + // `parquet_to_vortex_chunks`, so for a 100K-row column we do ~100K primitive + // slice indexes rather than 200K. The loop body is O(1) either way. 
+ let mut prev_end = list.offset_at(0)?; let first_end = list.offset_at(1)?; - let dim = first_end.checked_sub(first_start).ok_or_else(|| { + let dim = first_end.checked_sub(prev_end).ok_or_else(|| { vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") })?; if dim == 0 { vortex_bail!("list_to_vector_ext: first row has zero elements"); } + prev_end = first_end; for i in 1..num_rows { - let start = list.offset_at(i)?; let end = list.offset_at(i + 1)?; - let row_len = end.checked_sub(start).ok_or_else(|| { + let row_len = end.checked_sub(prev_end).ok_or_else(|| { vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") })?; if row_len != dim { @@ -340,6 +346,7 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { dim ); } + prev_end = end; } let elements = list.sliced_elements()?; From 93082288be02233bef021d93e9000a4683f7778e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:12:31 +0000 Subject: [PATCH 15/18] vortex-bench: add ListView test, make VectorDataset::to_vortex_array bail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two review items related to testing coverage and trait-contract honesty: 1. `list_to_vector_ext` has five unit tests but none of them exercised the `ListView` → `List` fast path, even though that path is the one the benchmark actually hits after `parquet_to_vortex_chunks` (which canonicalizes list columns into `ListView`, not `List`). Added a `list_view_input_is_rewrapped_as_vector_extension` test with a `list_view_f32(dim, rows)` helper that constructs a `ListViewArray` with uniform-stride offsets and sizes, then asserts `list_to_vector_ext` returns an `Extension` whose storage is a `FixedSizeList`. A regression to the ListView handling would now be caught by unit tests rather than by the end-to-end benchmark smoke test. `vortex-bench` tests are now 17 (was 16). 2. 
`VectorDataset::to_vortex_array` previously implemented the `Dataset` trait by downloading the parquet file, writing it through the default Vortex write path, and reading it back. The returned array was a `StructArray` with `{ id: int64, emb: list }` — **not** the `Extension(FixedSizeList<...>)` shape the vector-search benchmark actually operates on. The benchmark worked around this by bypassing `to_vortex_array` entirely and rebuilding the Vector extension via `parquet_to_vortex_chunks` + `list_to_vector_ext`. But that left the trait method as a trap for future callers who expect it to return the same shape the benchmark measures. Replaced the body with `bail!` and a message that points callers at the parquet path + list_to_vector_ext sequence. This makes the contract unambiguous: either use the parquet path, or get a clear error instead of a semantically-wrong array. The unused imports (`tokio::fs::File`, `ArrayStreamExt`, `OpenOptionsSessionExt`, `WriteOptionsSessionExt`, `SESSION`, `parquet_to_vortex_chunks`, `idempotent_async`, `IntoArray`) are dropped too. 13 vector-search-bench + 17 vortex-bench tests pass. 
Signed-off-by: Claude Signed-off-by: Connor Tsui --- vortex-bench/src/conversions.rs | 55 ++++++++++++++++++++++++++++ vortex-bench/src/vector_dataset.rs | 58 ++++++++++++------------------ 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 1a486034919..52fc1571a6a 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -374,6 +374,7 @@ mod tests { use vortex::array::IntoArray; use vortex::array::arrays::Extension; use vortex::array::arrays::List; + use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::validity::Validity; @@ -447,6 +448,60 @@ mod tests { ); } + /// Build a `ListView` whose every row is a length-`dim` slice of the flattened + /// `values` buffer. This shape matches what `parquet_to_vortex_chunks` produces for + /// embedding columns after arrow-rs' canonicalization, and exercises the + /// `list_to_vector_ext` fast-path that collapses `ListView` → `List` before + /// validating offsets. 
+ fn list_view_f32(dim: usize, rows: &[&[f32]]) -> vortex::array::ArrayRef { + let mut values = BufferMut::::with_capacity(rows.len() * dim); + for row in rows { + assert_eq!(row.len(), dim); + for &v in row.iter() { + values.push(v); + } + } + let elements = + PrimitiveArray::new::(values.freeze(), Validity::NonNullable).into_array(); + + let dim_i32 = i32::try_from(dim).unwrap(); + let num_rows = rows.len(); + + let mut offsets_buf = BufferMut::::with_capacity(num_rows); + for i in 0..num_rows { + offsets_buf.push(i32::try_from(i).unwrap() * dim_i32); + } + let offsets = + PrimitiveArray::new::(offsets_buf.freeze(), Validity::NonNullable).into_array(); + + let mut sizes_buf = BufferMut::::with_capacity(num_rows); + for _ in 0..num_rows { + sizes_buf.push(dim_i32); + } + let sizes = + PrimitiveArray::new::(sizes_buf.freeze(), Validity::NonNullable).into_array(); + + ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable) + .unwrap() + .into_array() + } + + #[test] + fn list_view_input_is_rewrapped_as_vector_extension() { + // Simulates the post-parquet-ingest shape: the `emb` column arrives as a + // ListView, not a List. `list_to_vector_ext` must materialize it via + // `recursive_list_from_list_view` and then validate offsets on the flattened + // `List` form. + let list_view = list_view_f32(3, &[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]]); + let wrapped = list_to_vector_ext(list_view).unwrap(); + assert_eq!(wrapped.len(), 2); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + #[test] fn non_float_element_type_is_rejected() { // Build a List. 
diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs index 86086ac57ac..e6b049fe614 100644 --- a/vortex-bench/src/vector_dataset.rs +++ b/vortex-bench/src/vector_dataset.rs @@ -18,20 +18,13 @@ use std::path::PathBuf; use anyhow::Result; +use anyhow::bail; use async_trait::async_trait; -use tokio::fs::File; use vortex::array::ArrayRef; -use vortex::array::IntoArray; -use vortex::array::stream::ArrayStreamExt; -use vortex::file::OpenOptionsSessionExt; -use vortex::file::WriteOptionsSessionExt; use crate::IdempotentPath; -use crate::SESSION; -use crate::conversions::parquet_to_vortex_chunks; use crate::datasets::Dataset; use crate::datasets::data_downloads::download_data; -use crate::idempotent_async; /// A public embedding-vector dataset used by the vector-search benchmark. /// @@ -171,35 +164,28 @@ impl Dataset for VectorDataset { Ok(parquet) } + /// **Not supported.** `VectorDataset` can't return a straight Vortex array via + /// [`Dataset::to_vortex_array`] because: + /// + /// - The struct-shaped array the other datasets return would arrive as + /// `{ id: int64, emb: list }` — with `emb` as a *list*, not the + /// `Extension(FixedSizeList<...>)` shape the vector-search benchmark + /// actually operates on. + /// - The benchmark therefore bypasses this method entirely: it calls + /// [`Dataset::to_parquet_path`] and then runs + /// [`crate::conversions::parquet_to_vortex_chunks`] + + /// [`crate::conversions::list_to_vector_ext`] itself, which produces the + /// correct `Extension` shape. + /// + /// Returning the raw struct here would be a trap for future callers who expect + /// the same semantic shape the benchmark measures. Bailing explicitly makes the + /// contract unambiguous. 
async fn to_vortex_array(&self) -> Result { - let parquet = self.to_parquet_path().await?; - let dir = format!("{}/", self.name()).to_data_path(); - let vortex = dir.join(format!("{}.vortex", self.name())); - - let data = parquet_to_vortex_chunks(parquet).await?; - idempotent_async(&vortex, async |path| -> Result<()> { - SESSION - .write_options() - .write( - &mut File::create(path) - .await - .map_err(|e| anyhow::anyhow!("Failed to create file: {}", e))?, - data.into_array().to_array_stream(), - ) - .await - .map_err(|e| anyhow::anyhow!("Failed to write vortex file: {}", e))?; - Ok(()) - }) - .await?; - - Ok(SESSION - .open_options() - .open_path(vortex.as_path()) - .await? - .scan()? - .into_array_stream()? - .read_all() - .await?) + bail!( + "VectorDataset::to_vortex_array is not supported; use `to_parquet_path` + \ + `parquet_to_vortex_chunks` + `list_to_vector_ext` to build the \ + Extension shape the benchmark needs" + ); } } From 8c1628b5762021954e479d81fb7b41d72fa1b611 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 15:14:20 +0000 Subject: [PATCH 16/18] vector-search-bench: style nits from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five small style-only cleanups identified during the review pass. Pure documentation + formatting — no behavior change. 1. `list_to_vector_ext` had one error message missing the `"list_to_vector_ext: ..."` prefix every other error used (`"list_to_vector_ext expects a List array ..."` → `"list_to_vector_ext: expected a List array ..."`). The test that asserts the substring is updated too. 2. `handrolled_baseline::filter_loop` now has an explicit comment flagging the `>` comparison as the *strict* greater-than that must stay in sync with the Vortex-side `Operator::Gt` in `build_similarity_search_tree`. 
A divergence between the two would show up as a max-abs-diff correctness failure for the lossless variants, but the comment makes the invariant explicit so the next person to touch either side thinks about it. 3. `gen_synthetic_dataset.rs` had three magic-number constants in the per-element value computation (`0.00013`, `0.00007`, `0.25`, plus `1.0/32768.0` for the random scale). Promoted them to named constants (`POS_FREQ_ROW`, `POS_FREQ_COL`, `POS_AMPLITUDE`, `RAND_SCALE`) with a comment explaining the intent: the sinusoid mixes the `(row, col)` position so vectors stay distinct even at low bit widths after quantization, and the frequency constants are chosen small and coprime to avoid short-period aliasing over the 100K-row × 1536-col domain. 4. `gen_synthetic_dataset.rs`'s module doc said "bit-identical across runs". That overpromises: the generator uses `f32::sin`, whose last few ULPs are libm/CPU-dependent. Changed to "deterministic... on the same machine" with an explicit note about cross-machine variance. Signed-off-by: Claude Signed-off-by: Connor Tsui --- .../src/bin/gen_synthetic_dataset.rs | 30 ++++++++++++++----- .../src/handrolled_baseline.rs | 6 +++- vortex-bench/src/conversions.rs | 4 +-- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs index 16e96cce5b4..d8a26578bc9 100644 --- a/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs +++ b/benchmarks/vector-search-bench/src/bin/gen_synthetic_dataset.rs @@ -6,8 +6,10 @@ //! access to `assets.zilliz.com`, and for sandbox / CI environments that block outbound //! HTTPS. //! -//! The generated file is bit-identical across runs for a given `(num_rows, dim, seed)` -//! triple so that downstream benchmark output is reproducible. +//! The generated file is deterministic for a given `(num_rows, dim, seed)` triple, so +//! 
downstream benchmark output is reproducible across runs on the same machine. Exact +//! bit-for-bit equality across different machines is not guaranteed because the PRNG +//! mixes in `f32::sin()`, whose last few ULPs are libm/CPU-dependent. //! //! Example: //! @@ -81,17 +83,31 @@ fn main() -> Result<()> { let mut offsets = Int32BufferBuilder::new(args.num_rows + 1); offsets.append(0i32); + // Generate per-element values as (random noise in `[-0.5, 0.5)`) + (position-based + // sinusoid of amplitude 0.25). The xorshift gives the random component; the sine + // mixes the row and column index so that vectors at different positions are distinct + // even at low bit widths after quantization. + // + // The sinusoid frequency constants below are deliberately small and coprime so that + // (row, col) → sine values don't repeat across the 100K-row × 1536-col domain any + // faster than ~100K rows. They don't have any particular mathematical meaning — they + // just need to be "slow enough to avoid short-period aliasing, fast enough that + // different rows look different". + const POS_FREQ_ROW: f32 = 0.00013; + const POS_FREQ_COL: f32 = 0.00007; + const POS_AMPLITUDE: f32 = 0.25; + const RAND_SCALE: f32 = 1.0 / 32768.0; + let mut state = args.seed.wrapping_add(1); for row in 0..args.num_rows { for i in 0..dim_usize { - // Deterministic xorshift mixed with position so every vector is distinct. 
state ^= state << 13; state ^= state >> 7; state ^= state << 17; - let scale = 1.0f32 / 32768.0; - let v = ((state & 0xFFFF) as f32 * scale - 0.5) - + ((row as f32 * 0.00013) + (i as f32 * 0.00007)).sin() * 0.25; - float_values.append_value(v); + let rand_component = (state & 0xFFFF) as f32 * RAND_SCALE - 0.5; + let pos_component = + ((row as f32 * POS_FREQ_ROW) + (i as f32 * POS_FREQ_COL)).sin() * POS_AMPLITUDE; + float_values.append_value(rand_component + pos_component); } let written = i32::try_from((row + 1) * dim_usize) .context("offset overflows i32 — reduce num_rows or dim")?; diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index 260d4726857..bf91915908f 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -258,7 +258,11 @@ pub fn cosine_loop(elements: &[f32], num_rows: usize, dim: usize, query: &[f32]) out } -/// Build the `cosine > threshold` boolean mask. +/// Build the `cosine > threshold` boolean mask — **strict greater-than**, matching the +/// Vortex-side path which uses `Operator::Gt` in +/// [`vortex_tensor::vector_search::build_similarity_search_tree`]. Keep these two in +/// sync: if one changes the comparison semantics, the correctness-verification pass will +/// start reporting a mismatch for the lossless variants. 
pub fn filter_loop(scores: &[f32], threshold: f32) -> Vec { scores.iter().map(|&s| s > threshold).collect() } diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 52fc1571a6a..8793ed23b9a 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -278,7 +278,7 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { let Some(list) = input.as_opt::() else { vortex_bail!( - "list_to_vector_ext expects a List array, got dtype {}", + "list_to_vector_ext: expected a List array, got dtype {}", input.dtype() ); }; @@ -433,7 +433,7 @@ mod tests { .into_array(); let err = list_to_vector_ext(primitive).unwrap_err().to_string(); assert!( - err.contains("expects a List array"), + err.contains("expected a List array"), "unexpected error: {err}" ); } From 17f574d74c25db7418b0303f786eb63435fdd7ba Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 16:01:30 +0000 Subject: [PATCH 17/18] =?UTF-8?q?vector-search-bench:=20second=20review=20?= =?UTF-8?q?pass=20=E2=80=94=20missed=20f32=20cast,=20imports,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A second pass over the branch caught several items I missed in the first review cleanup. All code-style and correctness-level, no behavior changes beyond the missed f32 assertion. ## Missed bug `verify::compute_cosine_scores` had the same blind `PrimitiveArray::as_slice::()` call I already fixed in `extract_query_row` last round — I only patched the lib.rs copy, not the verify.rs one. Added the `scores.ptype() != PType::F32` check with a clear error message before the slice cast. Matches the doc contract that the function returns `Vec` and only supports f32 Vector columns today. ## CLAUDE.md style violations CLAUDE.md's import rule: "All imports must be at the top of the module, never inside functions. The only exception is `#[cfg(test)]` blocks, where imports should be at the top of the test module." 
I had function-scoped `use` statements sprinkled across five functions in
`lib.rs` and inside an `extract_row_zero` test helper in `verify.rs`, all
against the guidance. Moved every function-scoped `use` into the top-level
import block:

- `prepare_dataset`: 1 import moved
- `extract_emb_column`: 5 imports moved
- `extract_query_row`: 4 imports moved
- `decompress_full_scan`: 3 imports moved
- `execute_cosine`: 2 imports moved
- `PreparedDataset::dim`: 2 fully-qualified `vortex::dtype::DType::...` paths
  replaced with plain `DType::...` from top-level import

## Test duplication

`verify::tests::extract_row_zero` was a 10-line reimplementation of
`extract_query_row` inside the test module. Deleted it and made
`verify::tests::make_prepared` call `extract_query_row` directly, which also
sidesteps the need for function-scoped imports in the test. Same rework
applied to `lib::tests::prepare_variant_produces_...` (which had its own
inline f32 extraction) and `recall::tests::...` (which was building a
`PreparedDataset` with an empty query). A new `test_prepared` helper in
`lib.rs` tests replaces the three separate inline constructions with a single
shared builder.

## New test coverage

Added 8 new tests across the three modules:

- `vortex-bench::conversions::all_invalid_list_validity_is_rejected`: builds a
  `List` with `Validity::AllInvalid` and confirms the rejection path fires.
  Previously unexercised.
- `vector-search-bench::tests::extract_query_row_returns_the_right_slice`:
  verifies row 0 extraction is idempotent and different rows differ.
- `vector-search-bench::tests::extract_query_row_rejects_out_of_bounds_row`:
  verifies the `row >= len` bounds check.
- `vector-search-bench::verify::tests::verify_scores_fails_on_nan_in_baseline`:
  symmetric NaN case — the existing `verify_scores_fails_on_nan` test only
  covered the variant-side NaN.
- `verify::tests::verify_and_report_scores_bails_for_lossless_mismatch`: lossless failures must hard-error with a message that includes the variant name. - `verify::tests::verify_and_report_scores_warns_for_lossy_mismatch_without_bailing`: lossy failures must NOT bail — they log a warning and return the failing report so the caller can still emit it as a measurement. - `recall::tests::top_k_indices_handles_nan_without_panicking`: NaN scores used to sort via `partial_cmp(...).unwrap_or(Equal)`, which produced arbitrary orderings. Switched to `f32::total_cmp` for a proper total order; test confirms non-NaN scores still rank in descending order and the sort doesn't panic. ## Minor - Stale comment on `main.rs::total_work` calculation: referenced `args.formats.len()` but the expression uses a different formula after commit 74aaf5a. Fixed the comment. - `top_k_indices` switched from `scores[b].partial_cmp(&scores[a]).unwrap_or(Ordering::Equal)` to `scores[b].total_cmp(&scores[a])`. Added a doc comment on the function explaining the NaN-safety rationale. ## Test counts after this commit - vortex-bench: 18 tests (was 17, +1 for `all_invalid_list_validity`) - vector-search-bench: 20 tests passing + 1 ignored (was 13 + 1, +7 for the new coverage above) - vortex-tensor: 222 tests (unchanged) Clippy / rustfmt clean. 
End-to-end smoke run on synthetic cohere-small still reports: - all four correctness-max-diff values coherent (0.000e0 for the three lossless variants, 5.18e-3 for TurboQuant) - Recall@10 = 0.91 for TurboQuant Signed-off-by: Claude Signed-off-by: Connor Tsui --- benchmarks/vector-search-bench/src/lib.rs | 140 ++++++++++--------- benchmarks/vector-search-bench/src/main.rs | 6 +- benchmarks/vector-search-bench/src/recall.rs | 42 +++++- benchmarks/vector-search-bench/src/verify.rs | 97 ++++++++++--- vortex-bench/src/conversions.rs | 25 ++++ 5 files changed, 210 insertions(+), 100 deletions(-) diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index d5d5df1f1a2..f710e9ba8aa 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -41,11 +41,23 @@ use anyhow::Result; use anyhow::bail; use clap::ValueEnum; use vortex::array::ArrayRef; +use vortex::array::ExecutionCtx; use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; use vortex::array::arrays::BoolArray; +use vortex::array::arrays::Chunked; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrays::Extension; +use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::Struct; +use vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::array::arrays::extension::ExtensionArrayExt; +use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex::array::arrays::struct_::StructArrayExt as _; +use vortex::dtype::DType; +use vortex::dtype::PType; use vortex::session::VortexSession; use vortex_bench::Format; use vortex_bench::SESSION; @@ -54,6 +66,8 @@ use vortex_bench::conversions::parquet_to_vortex_chunks; use vortex_bench::datasets::Dataset; use vortex_bench::vector_dataset::VectorDataset; use vortex_btrblocks::BtrBlocksCompressor; +use 
vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; +use vortex_tensor::vector_search::build_constant_query_vector; use vortex_tensor::vector_search::build_similarity_search_tree; use vortex_tensor::vector_search::compress_turboquant; @@ -137,16 +151,12 @@ impl PreparedDataset { /// and it guarantees this shape. pub fn dim(&self) -> u32 { let fsl_dtype = match self.uncompressed.dtype() { - vortex::dtype::DType::Extension(ext) => ext.storage_dtype(), - other => { - vortex::error::vortex_panic!("expected Extension, got {other}") - } + DType::Extension(ext) => ext.storage_dtype(), + other => vortex::error::vortex_panic!("expected Extension, got {other}"), }; match fsl_dtype { - vortex::dtype::DType::FixedSizeList(_, dim, _) => *dim, - other => { - vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}") - } + DType::FixedSizeList(_, dim, _) => *dim, + other => vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}"), } } @@ -159,8 +169,6 @@ impl PreparedDataset { /// Prepare a dataset by downloading its parquet file, converting the `emb` column to a /// `Vector` extension array, and extracting a single-row query vector. pub async fn prepare_dataset(dataset: &VectorDataset) -> Result { - use vortex::array::arrays::ExtensionArray; - let parquet_path = dataset .to_parquet_path() .await @@ -202,12 +210,6 @@ pub async fn prepare_dataset(dataset: &VectorDataset) -> Result /// Project the `emb` column out of a chunked struct array. This rebuilds a chunked list /// array with just that one column. 
fn extract_emb_column(struct_array: &ArrayRef) -> Result { - use vortex::array::arrays::Chunked; - use vortex::array::arrays::ChunkedArray; - use vortex::array::arrays::Struct; - use vortex::array::arrays::chunked::ChunkedArrayExt; - use vortex::array::arrays::struct_::StructArrayExt as _; - if let Some(chunked) = struct_array.as_opt::() { let mut emb_chunks: Vec = Vec::with_capacity(chunked.nchunks()); for chunk in chunked.iter_chunks() { @@ -238,11 +240,6 @@ fn extract_emb_column(struct_array: &ArrayRef) -> Result { /// restricts itself to `f32` vectors, so we assert the element type rather than /// quietly returning a mis-cast slice. pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { - use vortex::array::arrays::Extension; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - use vortex::dtype::PType; - if row >= vector_ext.len() { bail!( "query row {row} out of bounds for dataset of length {}", @@ -260,7 +257,7 @@ pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result *d as usize, + DType::FixedSizeList(_, d, _) => *d as usize, other => bail!("storage dtype must be FixedSizeList, got {other}"), }; @@ -437,12 +434,8 @@ pub struct VariantTimings { /// pipeline. 
pub fn decompress_full_scan( array: &ArrayRef, - ctx: &mut vortex::array::ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> Result { - use vortex::array::arrays::ExtensionArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - let ext: ExtensionArray = array.clone().execute(ctx)?; let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; // Force the element buffer all the way down to a canonical PrimitiveArray so the @@ -465,11 +458,8 @@ pub fn decompress_full_scan( pub fn execute_cosine( data: &ArrayRef, query: &[f32], - ctx: &mut vortex::array::ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> Result { - use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; - use vortex_tensor::vector_search::build_constant_query_vector; - let num_rows = data.len(); let query_vec = build_constant_query_vector(query, num_rows)?; let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows)?.into_array(); @@ -480,7 +470,7 @@ fn execute_filter( data: &ArrayRef, query: &[f32], threshold: f32, - ctx: &mut vortex::array::ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> Result { let tree = build_similarity_search_tree(data.clone(), query, threshold)?; Ok(tree.execute(ctx)?) @@ -525,36 +515,62 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use vortex::array::arrays::FixedSizeListArray; - use vortex::array::arrays::PrimitiveArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex_bench::SESSION; use super::test_utils::synthetic_vector; use super::*; + /// Build a test `PreparedDataset` from synthetic data, pulling the query from + /// row 0 via the shared `extract_query_row` helper so all tests exercise the + /// ptype-assertion path the benchmark hot path uses. 
+ fn test_prepared(dim: u32, num_rows: usize, seed: u64) -> PreparedDataset { + let uncompressed = synthetic_vector(dim, num_rows, seed); + let query = extract_query_row(&uncompressed, 0).unwrap(); + PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query, + parquet_bytes: 0, + } + } + + #[test] + fn extract_query_row_returns_the_right_slice() { + let dim = 8u32; + let num_rows = 4usize; + let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); + + // Row 0 extraction was already used to populate `prepared.query`; check it + // agrees with a second extraction for row 0, and that row 3 (last) is + // different (as it should be for distinct synthetic vectors). + let row0 = extract_query_row(&prepared.uncompressed, 0).unwrap(); + let row3 = extract_query_row(&prepared.uncompressed, 3).unwrap(); + assert_eq!(row0, prepared.query); + assert_eq!(row0.len(), dim as usize); + assert_eq!(row3.len(), dim as usize); + assert_ne!(row0, row3, "different rows must differ for this seed"); + } + + #[test] + fn extract_query_row_rejects_out_of_bounds_row() { + let dim = 8u32; + let num_rows = 4usize; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); + + let err = extract_query_row(&prepared.uncompressed, 4) + .unwrap_err() + .to_string(); + assert!( + err.contains("query row 4 out of bounds"), + "unexpected error: {err}" + ); + } + #[test] fn prepare_variant_produces_non_empty_array_for_all_variants() { let dim = 128u32; let num_rows = 64usize; - let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); - - let ext = uncompressed - .as_opt::() - .unwrap(); - let mut ctx = SESSION.create_execution_ctx(); - let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); - let slice = elements.as_slice::(); - let query = slice[..dim as usize].to_vec(); - - let prepared = PreparedDataset { - name: "synthetic".to_string(), - uncompressed: 
uncompressed.clone(), - query, - parquet_bytes: 0, - }; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); for variant in [ Variant::VortexUncompressed, @@ -585,14 +601,7 @@ mod tests { fn uncompressed_decompress_is_fast() { let dim = 128u32; let num_rows = 256usize; - let uncompressed = synthetic_vector(dim, num_rows, 0xDEADBEEF); - - let prepared = PreparedDataset { - name: "synthetic".to_string(), - uncompressed, - query: vec![0.1f32; dim as usize], - parquet_bytes: 0, - }; + let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); let uncompressed_prep = prepare_variant(&prepared, Variant::VortexUncompressed, &SESSION).unwrap(); @@ -628,14 +637,7 @@ mod tests { fn print_variant_trees() { let dim = 768u32; let num_rows = 500usize; - let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); - - let prepared = PreparedDataset { - name: "synthetic".to_string(), - uncompressed, - query: vec![0.1f32; dim as usize], - parquet_bytes: 0, - }; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); for variant in [ Variant::VortexUncompressed, diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 7cae73cf823..261d07becb3 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -179,9 +179,9 @@ async fn main() -> Result<()> { .filter_map(|f| f.into_variant()) .collect(); - // `args.formats.len()` counts both the handrolled baseline and the Vortex variants, - // so it matches the number of `progress.inc(1)` calls we'll make below (one per - // Vortex variant plus one per dataset for the handrolled path when it's enabled). + // One progress unit per inner-loop body: each Vortex variant plus the handrolled + // path (when it's enabled) gets exactly one `progress.inc(1)` below. Keep this + // count in sync with the number of `progress.inc` sites. 
let total_work = datasets.len() * (variants.len() + usize::from(run_handrolled_baseline)); let progress = ProgressBar::new(total_work as u64); diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index c091b74305e..e4fbd721f28 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -75,13 +75,15 @@ pub fn measure_recall_at_k( } /// Return the indices of the top-K highest scores, stable-sorted descending. +/// +/// Uses `f32::total_cmp` for a NaN-safe total order — `partial_cmp` would panic on +/// NaN, and `partial_cmp(...).unwrap_or(Ordering::Equal)` would put NaNs at +/// arbitrary positions. `total_cmp` gives NaNs a well-defined (but meaningless) sort +/// slot, which lets the function be robust against accidental NaN inputs without +/// silently hiding them. fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { let mut idx: Vec = (0..scores.len()).collect(); - idx.sort_by(|&a, &b| { - scores[b] - .partial_cmp(&scores[a]) - .unwrap_or(std::cmp::Ordering::Equal) - }); + idx.sort_by(|&a, &b| scores[b].total_cmp(&scores[a])); idx.truncate(top_k); idx } @@ -92,9 +94,34 @@ mod tests { use super::*; use crate::Variant; + use crate::extract_query_row; use crate::prepare_variant; use crate::test_utils::synthetic_vector; + #[test] + fn top_k_indices_handles_nan_without_panicking() { + // `partial_cmp` panics on NaN (well, returns None, which was silently swallowed + // before). `total_cmp` gives NaN a well-defined slot, so the sort doesn't + // panic and doesn't produce arbitrary orderings for non-NaN elements. + let scores = [0.9f32, f32::NAN, 0.7, 0.5, f32::NAN]; + let top = top_k_indices(&scores, 3); + assert_eq!(top.len(), 3); + // The finite values 0.9, 0.7, 0.5 should still rank in the right order + // relative to each other — NaNs sort somewhere, but the finite ordering is + // preserved because `total_cmp` is a total order. 
+ let finite_positions: Vec = top + .iter() + .copied() + .filter(|&i| !scores[i].is_nan()) + .collect(); + assert!( + finite_positions + .windows(2) + .all(|w| scores[w[0]] >= scores[w[1]]), + "finite scores should still be in descending order" + ); + } + #[test] fn uncompressed_has_perfect_self_recall() { let dim = 128u32; @@ -114,10 +141,13 @@ mod tests { let num_rows = 64usize; let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + // `measure_recall_at_k` doesn't need the PreparedDataset's `query` field — + // it derives queries internally via `extract_query_row` on `uncompressed`. + // Construct just enough of a `PreparedDataset` to pass to `prepare_variant`. let prepared = crate::PreparedDataset { name: "synthetic".to_string(), uncompressed: uncompressed.clone(), - query: vec![], + query: extract_query_row(&uncompressed, 0).unwrap(), parquet_bytes: 0, }; diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index cbca8a3adc5..ca58ff44b13 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -24,6 +24,7 @@ use anyhow::Result; use anyhow::bail; use vortex::array::ArrayRef; use vortex::array::VortexSessionExecute; +use vortex::dtype::PType; use vortex::session::VortexSession; use crate::execute_cosine; @@ -78,9 +79,17 @@ impl VerificationReport { } /// Compute cosine-similarity scores for a single query row on `data` and return them -/// as a plain `Vec`. This is just a convenience wrapper around +/// as a plain `Vec`. This is a convenience wrapper around /// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting /// `PrimitiveArray`. +/// +/// # Errors +/// +/// Returns an error if [`execute_cosine`] fails (bad input shape or dispatch error), +/// or if the cosine expression produces a non-`f32` primitive array. 
The latter can't +/// happen today because the benchmark only wires `f32` `Vector` columns, but the +/// explicit ptype check keeps the function sound if the scalar-fn output type ever +/// widens (e.g. to `f64`) without the caller noticing. pub fn compute_cosine_scores( data: &ArrayRef, query: &[f32], @@ -88,6 +97,12 @@ pub fn compute_cosine_scores( ) -> Result> { let mut ctx = session.create_execution_ctx(); let scores = execute_cosine(data, query, &mut ctx)?; + if scores.ptype() != PType::F32 { + bail!( + "compute_cosine_scores: cosine output must be f32, got {:?}", + scores.ptype() + ); + } Ok(scores.as_slice::().to_vec()) } @@ -207,35 +222,26 @@ mod tests { use super::*; use crate::Variant; + use crate::extract_query_row; use crate::prepare_variant; use crate::test_utils::synthetic_vector; + /// Build a `PreparedDataset` whose `query` is row 0 of the dataset. Using + /// `extract_query_row` here (rather than a test-local f32 extraction helper) also + /// keeps the test surface covered by the same ptype-assertion path the benchmark + /// hot path uses. fn make_prepared(dim: u32, num_rows: usize, seed: u64) -> crate::PreparedDataset { let uncompressed = synthetic_vector(dim, num_rows, seed); + let query = extract_query_row(&uncompressed, 0).unwrap(); + assert_eq!(query.len(), dim as usize); crate::PreparedDataset { name: "synthetic".to_string(), uncompressed, - // Filled in below from row 0. 
- query: vec![], + query, parquet_bytes: 0, } } - fn extract_row_zero(uncompressed: &ArrayRef, dim: u32) -> Vec { - use vortex::array::VortexSessionExecute; - use vortex::array::arrays::Extension; - use vortex::array::arrays::FixedSizeListArray; - use vortex::array::arrays::PrimitiveArray; - use vortex::array::arrays::extension::ExtensionArrayExt; - use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; - - let mut ctx = SESSION.create_execution_ctx(); - let ext = uncompressed.as_opt::().unwrap(); - let fsl: FixedSizeListArray = ext.storage_array().clone().execute(&mut ctx).unwrap(); - let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx).unwrap(); - elements.as_slice::()[..dim as usize].to_vec() - } - #[test] fn compare_scores_handles_empty() { let (mean, max) = compare_scores(&[], &[]); @@ -305,12 +311,60 @@ mod tests { assert!(report.max_abs_diff.is_infinite()); } + #[test] + fn verify_scores_fails_on_nan_in_baseline() { + // Symmetric case: NaN on the baseline side should also fail, not just variant. 
+ let base = [0.5f32, f32::NAN]; + let other = [0.5f32, 0.5]; + let report = verify_scores(&base, &other, VerificationKind::Lossless); + assert!(!report.passed); + assert!(report.max_abs_diff.is_infinite()); + } + + #[test] + fn verify_and_report_scores_is_ok_for_identical_inputs() { + let base = [0.5f32; 10]; + let report = + verify_and_report_scores("self", &base, &base, VerificationKind::Lossless).unwrap(); + assert!(report.passed); + assert_eq!(report.max_abs_diff, 0.0); + } + + #[test] + fn verify_and_report_scores_bails_for_lossless_mismatch() { + let base = [0.5f32; 10]; + let mut other = [0.5f32; 10]; + other[3] = 0.6; + let err = + verify_and_report_scores("broken-variant", &other, &base, VerificationKind::Lossless) + .unwrap_err() + .to_string(); + assert!( + err.contains("broken-variant correctness check failed"), + "unexpected error: {err}" + ); + } + + #[test] + fn verify_and_report_scores_warns_for_lossy_mismatch_without_bailing() { + // A lossy variant outside its tolerance should NOT bail — it logs a warning + // and returns the failing report so the caller can still emit the + // measurement and show recall alongside it. 
+ let base = [0.9f32; 10]; + let mut other = [0.9f32; 10]; + other[0] = 1.5; // diff of 0.6, above the 0.2 lossy tolerance + let report = + verify_and_report_scores("too-lossy-variant", &other, &base, VerificationKind::Lossy) + .expect("lossy failures should not bail"); + assert!(!report.passed); + assert!(report.max_abs_diff > f64::from(LOSSY_TOLERANCE)); + } + #[test] fn vortex_default_matches_uncompressed_end_to_end() { let dim = 128u32; let num_rows = 64usize; - let mut prepared = make_prepared(dim, num_rows, 0xC0FFEE); - prepared.query = extract_row_zero(&prepared.uncompressed, dim); + let prepared = make_prepared(dim, num_rows, 0xC0FFEE); let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); @@ -332,8 +386,7 @@ mod tests { fn vortex_turboquant_stays_within_lossy_tolerance() { let dim = 128u32; let num_rows = 64usize; - let mut prepared = make_prepared(dim, num_rows, 0xDEADBEEF); - prepared.query = extract_row_zero(&prepared.uncompressed, dim); + let prepared = make_prepared(dim, num_rows, 0xDEADBEEF); let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 8793ed23b9a..70330e77f91 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -502,6 +502,31 @@ mod tests { )); } + #[test] + fn all_invalid_list_validity_is_rejected() { + // A list with `Validity::AllInvalid` means every row is null. The Vector + // extension type requires non-nullable elements at the FSL level, so we + // must reject this input rather than silently dropping the validity mask. 
+ let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 3, 6]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = + vortex::array::Array::::new(elements, offsets, Validity::AllInvalid).into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("list rows must be non-nullable"), + "unexpected error: {err}" + ); + } + #[test] fn non_float_element_type_is_rejected() { // Build a List. From 87ac1c683f948c736cb20b22443693544ce8391a Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 16:20:24 +0000 Subject: [PATCH 18/18] =?UTF-8?q?vector-search-bench:=20third=20review=20p?= =?UTF-8?q?ass=20=E2=80=94=20drop=20session,=20dedupe=20parquet=20reads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three targeted fixes from the third review pass. ## 1. Drop vestigial `session: &VortexSession` parameters The benchmark always passed `&vortex_bench::SESSION` to every function that took a session parameter, so the parameter was plumbing with zero value. More importantly it was *inconsistent* — some internal helpers (like `extract_query_row`) already hardcoded the global, so the API suggested a custom session was respected end-to-end when it wasn't. Removed the `session` parameter from: - `lib::prepare_variant(prepared, variant)` - `lib::run_timings(variant_array, query, iterations)` - `verify::compute_cosine_scores(data, query)` - `verify::verify_variant(name, array, query, baseline, kind)` - `recall::measure_recall_at_k(uncompressed, compressed, num_queries, top_k)` Each function now calls `vortex_bench::SESSION.create_execution_ctx()` directly where needed. Doc comments note the "uses the global SESSION" contract explicitly. 
`execute_cosine`, `decompress_full_scan`, and `execute_filter` still take `&mut ExecutionCtx` because they're the low-level primitives that `run_timings` constructs fresh contexts for on each iteration. All call sites in `main.rs` and the test modules updated. ## 2. Fix the handrolled double-parquet-read `main.rs` previously did: 1. Read parquet → run cosine → verify against baseline → bail on fail 2. Call `run_handrolled_baseline_timings`, which reads parquet N more times inside its own loop Total: N+1 parquet reads per dataset per run. The extra read was wasted work — `cosine_loop` is deterministic, so the scores from iteration N of the timing loop are bit-identical to a one-shot pre-timing computation. Restructured `run_handrolled_baseline_timings` to return a new `HandrolledBaselineResult { timings, last_scores }` where `last_scores` is the `Vec` produced by the final iteration. `main.rs` now: 1. Call `run_handrolled_baseline_timings` → gets timings + last_scores 2. Call `verify_and_report_scores` with those last_scores vs baseline → bails on lossless mismatch Total: N parquet reads per dataset per run. One less read, and the verification now uses the exact same scores the timing loop computed rather than a separate pass. Order reversal note: verification now runs *after* timing instead of before. The handrolled loop is cheap enough (~5 ms cosine + ~40 ms parquet read per iter) that running it once even for a broken variant is acceptable, and the correctness bail still prevents the run from reporting wrong-answer numbers. `run_handrolled_baseline_timings` now asserts `iterations >= 1` up front (previously relied on `last_scores` being populated after the loop). ## 3. Nullable element type test `list_to_vector_ext` has a rejection path for `List` — element dtype is a nullable f32 (even when every value is present). Added `conversions::tests::nullable_element_dtype_is_rejected` to exercise this path. 
Passing `Validity::AllValid` to `PrimitiveArray::new::` produces the nullable dtype the rejection path is looking for. ## Also: housekeeping - Dropped `use vortex_bench::SESSION` from the `lib.rs` inner test module (was left over from the session-parameter removal). - Replaced two fully-qualified `vortex::error::vortex_panic!` calls in `PreparedDataset::dim()` with a plain `vortex_panic!` sourced from a top-level import. - Refreshed the doc comment on `PreparedDataset` — it previously described the struct as carrying an "execution session / context", which is no longer accurate. - Added two new tests for `run_handrolled_baseline_timings`: - `run_handrolled_baseline_timings_returns_last_iteration_scores`: verifies the `last_scores` contract on a 3-row fixture. - `run_handrolled_baseline_timings_panics_on_zero_iterations`: regression guard for the iterations>=1 assert. ## Test counts - vortex-bench: 19 tests (was 18, +1 for nullable_element_dtype) - vector-search-bench: 22 passing + 1 ignored (was 20+1, +2 for the handrolled timing tests) - vortex-tensor: 222 tests (unchanged) - Total: 263 passing, 3 ignored clippy / rustfmt clean. 
End-to-end smoke run still coherent: - correctness-max-diff = 0.000e0 for all three lossless variants - correctness-max-diff = 5.18e-3 for TurboQuant (within 0.2 tolerance) - Recall@10 for TurboQuant = 0.9267 Signed-off-by: Claude Signed-off-by: Connor Tsui --- .../src/handrolled_baseline.rs | 86 +++++++++++++++++-- benchmarks/vector-search-bench/src/lib.rs | 51 +++++------ benchmarks/vector-search-bench/src/main.rs | 56 +++++------- benchmarks/vector-search-bench/src/recall.rs | 17 ++-- benchmarks/vector-search-bench/src/verify.rs | 29 +++---- vortex-bench/src/conversions.rs | 29 +++++++ 6 files changed, 168 insertions(+), 100 deletions(-) diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index bf91915908f..f62c5198fb6 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -187,20 +187,51 @@ pub struct HandrolledBaselineData { pub num_rows: usize, } +/// Result of running the hand-rolled baseline timing loop. +/// +/// Carries both the best-of-N timing numbers **and** the cosine scores from the final +/// iteration. The scores are exposed so the caller can feed them into +/// [`crate::verify::verify_and_report_scores`] for the correctness check without +/// re-reading the parquet file. Because `cosine_loop` is deterministic, the scores +/// from any iteration equal the scores from every other iteration; using the last +/// one is simply the most convenient snapshot. +pub struct HandrolledBaselineResult { + /// Best-of-N wall times for decompress / cosine / filter. + pub timings: VariantTimings, + /// Cosine-similarity scores from the final iteration. Length equals the dataset + /// row count. + pub last_scores: Vec, +} + /// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline -/// and return the best-of-N wall times. 
The decompress phase re-reads the parquet file -/// from disk on each iteration (matches how the Vortex variants re-execute their tree -/// from scratch each iteration), and the compute phase runs [`cosine_loop`] and -/// [`filter_loop`] over the flat `Vec` the decompress phase produced. +/// and return the best-of-N wall times along with the last iteration's cosine scores. +/// +/// The decompress phase re-reads the parquet file from disk on each iteration (matches +/// how the Vortex variants re-execute their tree from scratch each iteration), and the +/// compute phase runs [`cosine_loop`] and [`filter_loop`] over the flat `Vec` the +/// decompress phase produced. Returning the last iteration's scores lets the caller +/// perform correctness verification against the Vortex baseline without a redundant +/// parquet read. +/// +/// # Panics +/// +/// Panics if `iterations == 0`. The benchmark CLI defaults to 5 and the lowest +/// meaningful value is 1 (single-shot best-of-1). pub fn run_handrolled_baseline_timings( parquet_path: &Path, query: &[f32], threshold: f32, iterations: usize, -) -> Result { +) -> Result { + assert!( + iterations > 0, + "run_handrolled_baseline_timings requires iterations >= 1" + ); + let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; let mut filter = Duration::MAX; + let mut last_scores: Vec = Vec::new(); for _ in 0..iterations { let start = Instant::now(); @@ -216,12 +247,17 @@ pub fn run_handrolled_baseline_timings( let matches = filter_loop(&scores, threshold); filter = filter.min(start.elapsed()); debug_assert_eq!(matches.len(), data.num_rows); + + last_scores = scores; } - Ok(VariantTimings { - decompress, - cosine, - filter, + Ok(HandrolledBaselineResult { + timings: VariantTimings { + decompress, + cosine, + filter, + }, + last_scores, }) } @@ -330,4 +366,36 @@ mod tests { let mask = filter_loop(&scores, 0.5); assert_eq!(mask, vec![true, false, true]); } + + #[test] + fn 
run_handrolled_baseline_timings_returns_last_iteration_scores() { + // Verifies the new `last_scores` contract: the timing loop returns the + // cosine scores from the final iteration, and those scores match what we'd + // get from a one-shot `cosine_loop` on the same data. Callers of + // `run_handrolled_baseline_timings` rely on this for verification (so they + // don't need a second parquet read to compute ground-truth scores). + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + let query = [1.0f32, 0.0, 0.0]; + + let result = run_handrolled_baseline_timings(file.path(), &query, 0.5, 3).unwrap(); + + // Deterministic expected scores: rows 0 and 2 match the query exactly, + // row 1 is orthogonal. + assert_eq!(result.last_scores, vec![1.0, 0.0, 1.0]); + assert!(result.timings.decompress > Duration::ZERO); + assert!(result.timings.cosine > Duration::ZERO); + assert!(result.timings.filter > Duration::ZERO); + } + + #[test] + #[should_panic(expected = "iterations >= 1")] + fn run_handrolled_baseline_timings_panics_on_zero_iterations() { + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + let query = [1.0f32, 0.0, 0.0]; + let _result = run_handrolled_baseline_timings(file.path(), &query, 0.5, 0); + } } diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index f710e9ba8aa..0a70fdfed1e 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -58,7 +58,7 @@ use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex::array::arrays::struct_::StructArrayExt as _; use vortex::dtype::DType; use vortex::dtype::PType; -use vortex::session::VortexSession; +use vortex::error::vortex_panic; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_bench::conversions::list_to_vector_ext; @@ -126,7 +126,8 @@ impl Variant { } } -/// A 
materialized Vortex array and its associated execution session / context. +/// The ingested form of a dataset, ready to be fed to [`prepare_variant`] and the +/// timing/verification pipeline. pub struct PreparedDataset { /// Name used in metric strings — usually the dataset's `Dataset::name()`. pub name: String, @@ -152,11 +153,11 @@ impl PreparedDataset { pub fn dim(&self) -> u32 { let fsl_dtype = match self.uncompressed.dtype() { DType::Extension(ext) => ext.storage_dtype(), - other => vortex::error::vortex_panic!("expected Extension, got {other}"), + other => vortex_panic!("expected Extension, got {other}"), }; match fsl_dtype { DType::FixedSizeList(_, dim, _) => *dim, - other => vortex::error::vortex_panic!("expected FixedSizeList storage, got {other}"), + other => vortex_panic!("expected FixedSizeList storage, got {other}"), } } @@ -292,7 +293,8 @@ pub struct PreparedVariant { /// Apply a `Variant`'s preparation strategy to the materialized uncompressed source and /// return the resulting tree together with its reported in-memory size and construction -/// time. +/// time. Uses the global [`vortex_bench::SESSION`] for any execution-context work; the +/// benchmark has no reason to support multiple concurrent sessions. /// /// **Why nbytes instead of on-disk size?** The Vortex file writer applies BtrBlocks /// compression as part of its default write strategy regardless of the in-memory tree @@ -301,11 +303,7 @@ pub struct PreparedVariant { /// compressed tree — the disk-size comparison collapses two conceptually different /// things into one number. Reporting `nbytes()` of the in-memory tree keeps the size /// measurement consistent with what the *compute* measurements operate on. 
-pub fn prepare_variant( - prepared: &PreparedDataset, - variant: Variant, - session: &VortexSession, -) -> Result { +pub fn prepare_variant(prepared: &PreparedDataset, variant: Variant) -> Result { match variant { Variant::VortexUncompressed => { // Identity: the uncompressed Extension is already materialized. Still @@ -333,7 +331,7 @@ pub fn prepare_variant( }) } Variant::VortexTurboQuant => { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; let compress_duration = start.elapsed(); @@ -359,13 +357,13 @@ pub fn prepare_variant( /// interleaved form makes each stage see roughly the same cache state every /// iteration. /// -/// Each stage still gets a fresh `ExecutionCtx`, so no cached scalar-fn state leaks -/// between stages within a single iteration. +/// Each stage still gets a fresh `ExecutionCtx` (from the global +/// [`vortex_bench::SESSION`]), so no cached scalar-fn state leaks between stages +/// within a single iteration. 
pub fn run_timings( variant_array: &ArrayRef, query: &[f32], iterations: usize, - session: &VortexSession, ) -> Result { let mut decompress = Duration::MAX; let mut cosine = Duration::MAX; @@ -373,21 +371,21 @@ pub fn run_timings( for _ in 0..iterations { { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; decompress = decompress.min(start.elapsed()); drop(decoded); } { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; cosine = cosine.min(start.elapsed()); drop(scores); } { - let mut ctx = session.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let start = Instant::now(); let matches: BoolArray = execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; @@ -515,8 +513,6 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use vortex_bench::SESSION; - use super::test_utils::synthetic_vector; use super::*; @@ -577,7 +573,7 @@ mod tests { Variant::VortexDefault, Variant::VortexTurboQuant, ] { - let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); + let prep = prepare_variant(&prepared, variant).unwrap(); assert_eq!( prep.array.len(), num_rows, @@ -585,7 +581,7 @@ mod tests { ); assert!(prep.nbytes > 0, "variant {variant:?} reported zero size"); - let timings = run_timings(&prep.array, &prepared.query, 2, &SESSION).unwrap(); + let timings = run_timings(&prep.array, &prepared.query, 2).unwrap(); // TurboQuant + default must do real work; uncompressed's decompress is a // no-op and can plausibly time as zero. 
assert!(timings.cosine > Duration::ZERO); @@ -603,14 +599,11 @@ mod tests { let num_rows = 256usize; let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); - let uncompressed_prep = - prepare_variant(&prepared, Variant::VortexUncompressed, &SESSION).unwrap(); - let turboquant_prep = - prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let uncompressed_prep = prepare_variant(&prepared, Variant::VortexUncompressed).unwrap(); + let turboquant_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); - let unc_timings = - run_timings(&uncompressed_prep.array, &prepared.query, 3, &SESSION).unwrap(); - let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3, &SESSION).unwrap(); + let unc_timings = run_timings(&uncompressed_prep.array, &prepared.query, 3).unwrap(); + let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3).unwrap(); // The uncompressed decompress should be at least an order of magnitude faster // than TurboQuant's (usually many orders of magnitude). 
5x is a loose lower @@ -644,7 +637,7 @@ mod tests { Variant::VortexDefault, Variant::VortexTurboQuant, ] { - let prep = prepare_variant(&prepared, variant, &SESSION).unwrap(); + let prep = prepare_variant(&prepared, variant).unwrap(); println!("=== {variant:?} ==="); println!(" len : {}", prep.array.len()); println!(" nbytes : {}", prep.nbytes); diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 261d07becb3..5e78563ef22 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -29,8 +29,6 @@ use clap::Parser; use indicatif::ProgressBar; use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; -use vector_search_bench::handrolled_baseline::cosine_loop; -use vector_search_bench::handrolled_baseline::read_parquet_embedding_column; use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; @@ -42,7 +40,6 @@ use vector_search_bench::verify::compute_cosine_scores; use vector_search_bench::verify::verify_and_report_scores; use vector_search_bench::verify::verify_variant; use vortex_bench::Format; -use vortex_bench::SESSION; use vortex_bench::create_output_writer; use vortex_bench::datasets::Dataset; use vortex_bench::display::DisplayFormat; @@ -202,9 +199,8 @@ async fn main() -> Result<()> { // Ground-truth cosine scores for the verification query — the scores produced by // the uncompressed Vortex scan. Every other variant (including the hand-rolled // baseline) will be compared against this. 
- let baseline_scores = - compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION) - .context("compute ground-truth cosine scores for verification")?; + let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query) + .context("compute ground-truth cosine scores for verification")?; tracing::info!( "computed {} ground-truth cosine scores for {}", baseline_scores.len(), @@ -222,26 +218,30 @@ async fn main() -> Result<()> { // parquet on disk — only the *compute* is hand-rolled. The metric `name` field // carries the `handrolled` label so human readers can tell the compute apart // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. + // + // Timing runs first and returns the cosine scores from its final iteration; + // verification then reuses those scores rather than re-reading the parquet + // file. `cosine_loop` is deterministic, so the last-iteration scores equal + // what a separate pre-timing verification pass would produce — we just save + // one parquet read per dataset. If the scores drift from the Vortex baseline, + // `verify_and_report_scores` bails here (after the timing already ran, which + // is acceptable because the handrolled loop is cheap and we'd rather run it + // twice than skip correctness). if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; let label = "handrolled"; let bench_name = format!("{label}/{}", prepared.name); - // Verify the handrolled cosine scores against the Vortex baseline before - // any timing starts. `verify_and_report_scores` is the same helper the - // Vortex-variant loop ends up calling through `verify_variant`, so the - // two paths share all their pass/fail / log / bail logic. 
- let baseline_data = read_parquet_embedding_column(&parquet_path) - .context("read parquet emb column for verification")?; - let handrolled_scores = cosine_loop( - &baseline_data.elements, - baseline_data.num_rows, - baseline_data.dim, + let baseline_result = run_handrolled_baseline_timings( + &parquet_path, &prepared.query, - ); + DEFAULT_THRESHOLD, + args.iterations, + )?; + let handrolled_report = verify_and_report_scores( &bench_name, - &handrolled_scores, + &baseline_result.last_scores, &baseline_scores, VerificationKind::Lossless, )?; @@ -258,13 +258,6 @@ async fn main() -> Result<()> { value: handrolled_report.max_abs_diff, }); - let baseline_timings = run_handrolled_baseline_timings( - &parquet_path, - &prepared.query, - DEFAULT_THRESHOLD, - args.iterations, - )?; - sizes.push(CustomUnitMeasurement { name: format!("{label} size/{}", prepared.name), format: Format::Parquet, @@ -274,24 +267,24 @@ async fn main() -> Result<()> { timings.push(CompressionTimingMeasurement { name: format!("decompress time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.decompress, + time: baseline_result.timings.decompress, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-similarity time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.cosine, + time: baseline_result.timings.cosine, }); timings.push(CompressionTimingMeasurement { name: format!("cosine-filter time/{bench_name}"), format: Format::Parquet, - time: baseline_timings.filter, + time: baseline_result.timings.filter, }); progress.inc(1); } for &variant in &variants { - let prep = prepare_variant(&prepared, variant, &SESSION)?; + let prep = prepare_variant(&prepared, variant)?; let variant_label = variant.label(); let bench_name = format!("{variant_label}/{}", prepared.name); @@ -311,7 +304,6 @@ async fn main() -> Result<()> { &prepared.query, &baseline_scores, kind, - &SESSION, )?; tracing::info!( "{} verification ({:?}): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", 
@@ -345,8 +337,7 @@ async fn main() -> Result<()> { time: prep.compress_duration, }); - let variant_timings = - run_timings(&prep.array, &prepared.query, args.iterations, &SESSION)?; + let variant_timings = run_timings(&prep.array, &prepared.query, args.iterations)?; timings.push(CompressionTimingMeasurement { name: format!("decompress time/{bench_name}"), @@ -373,7 +364,6 @@ async fn main() -> Result<()> { &prep.array, args.recall_queries, args.recall_k, - &SESSION, )?; tracing::info!("Recall@{} for {}: {:.4}", args.recall_k, bench_name, recall); recalls.push(CustomUnitMeasurement { diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs index e4fbd721f28..fb1dff4dc8c 100644 --- a/benchmarks/vector-search-bench/src/recall.rs +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -16,7 +16,6 @@ use anyhow::Result; use vortex::array::ArrayRef; -use vortex::session::VortexSession; use vortex::utils::aliases::hash_set::HashSet; use crate::extract_query_row; @@ -26,7 +25,8 @@ use crate::verify::compute_cosine_scores; pub const DEFAULT_TOP_K: usize = 10; /// Compute recall@K for the lossy `compressed` variant against the `uncompressed` -/// ground-truth variant, averaged over `num_queries` sampled query rows. +/// ground-truth variant, averaged over `num_queries` sampled query rows. Uses the +/// global [`vortex_bench::SESSION`] for all executions. /// /// Query selection is deterministic: rows are picked uniformly across the dataset at /// `step = uncompressed.len() / num_queries` intervals. 
This keeps the result stable @@ -36,7 +36,6 @@ pub fn measure_recall_at_k( compressed: &ArrayRef, num_queries: usize, top_k: usize, - session: &VortexSession, ) -> Result { assert!( num_queries > 0, @@ -60,10 +59,10 @@ pub fn measure_recall_at_k( let row = (q * step).min(num_rows - 1); let query = extract_query_row(uncompressed, row)?; - let gt_scores = compute_cosine_scores(uncompressed, &query, session)?; + let gt_scores = compute_cosine_scores(uncompressed, &query)?; let truth = top_k_indices(>_scores, top_k); - let lossy_scores = compute_cosine_scores(compressed, &query, session)?; + let lossy_scores = compute_cosine_scores(compressed, &query)?; let lossy = top_k_indices(&lossy_scores, top_k); let truth_set: HashSet = truth.iter().copied().collect(); @@ -90,8 +89,6 @@ fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { #[cfg(test)] mod tests { - use vortex_bench::SESSION; - use super::*; use crate::Variant; use crate::extract_query_row; @@ -128,7 +125,7 @@ mod tests { let num_rows = 64usize; let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); - let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10, &SESSION).unwrap(); + let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10).unwrap(); assert!( (recall - 1.0).abs() < 1e-9, "self-recall must be 1.0, got {recall}" @@ -151,12 +148,12 @@ mod tests { parquet_bytes: 0, }; - let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); // With only 64 random rows, recall@10 won't be 1.0 but it should be well // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch // total regressions without being flaky on distribution noise. 
- let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10, &SESSION).unwrap(); + let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10).unwrap(); assert!( recall >= 0.3, "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs index ca58ff44b13..da2a73f1ea2 100644 --- a/benchmarks/vector-search-bench/src/verify.rs +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -25,7 +25,7 @@ use anyhow::bail; use vortex::array::ArrayRef; use vortex::array::VortexSessionExecute; use vortex::dtype::PType; -use vortex::session::VortexSession; +use vortex_bench::SESSION; use crate::execute_cosine; @@ -81,7 +81,7 @@ impl VerificationReport { /// Compute cosine-similarity scores for a single query row on `data` and return them /// as a plain `Vec`. This is a convenience wrapper around /// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting -/// `PrimitiveArray`. +/// `PrimitiveArray`. Uses the global [`vortex_bench::SESSION`]. /// /// # Errors /// @@ -90,12 +90,8 @@ impl VerificationReport { /// happen today because the benchmark only wires `f32` `Vector` columns, but the /// explicit ptype check keeps the function sound if the scalar-fn output type ever /// widens (e.g. to `f64`) without the caller noticing. -pub fn compute_cosine_scores( - data: &ArrayRef, - query: &[f32], - session: &VortexSession, -) -> Result> { - let mut ctx = session.create_execution_ctx(); +pub fn compute_cosine_scores(data: &ArrayRef, query: &[f32]) -> Result> { + let mut ctx = SESSION.create_execution_ctx(); let scores = execute_cosine(data, query, &mut ctx)?; if scores.ptype() != PType::F32 { bail!( @@ -203,23 +199,20 @@ pub fn verify_and_report_scores( /// same query used for the baseline and returns a [`VerificationReport`]. 
Returns /// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond /// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality -/// tradeoff. +/// tradeoff. Uses the global [`vortex_bench::SESSION`]. pub fn verify_variant( variant_name: &str, variant_array: &ArrayRef, query: &[f32], baseline_scores: &[f32], kind: VerificationKind, - session: &VortexSession, ) -> Result { - let scores = compute_cosine_scores(variant_array, query, session)?; + let scores = compute_cosine_scores(variant_array, query)?; verify_and_report_scores(variant_name, &scores, baseline_scores, kind) } #[cfg(test)] mod tests { - use vortex_bench::SESSION; - use super::*; use crate::Variant; use crate::extract_query_row; @@ -367,16 +360,15 @@ mod tests { let prepared = make_prepared(dim, num_rows, 0xC0FFEE); let baseline_scores = - compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + compute_cosine_scores(&prepared.uncompressed, &prepared.query).unwrap(); - let default_prep = prepare_variant(&prepared, Variant::VortexDefault, &SESSION).unwrap(); + let default_prep = prepare_variant(&prepared, Variant::VortexDefault).unwrap(); let report = verify_variant( "vortex-default", &default_prep.array, &prepared.query, &baseline_scores, VerificationKind::Lossless, - &SESSION, ) .expect("vortex-default must be lossless against the uncompressed baseline"); assert!(report.passed); @@ -389,16 +381,15 @@ mod tests { let prepared = make_prepared(dim, num_rows, 0xDEADBEEF); let baseline_scores = - compute_cosine_scores(&prepared.uncompressed, &prepared.query, &SESSION).unwrap(); + compute_cosine_scores(&prepared.uncompressed, &prepared.query).unwrap(); - let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant, &SESSION).unwrap(); + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); let report = verify_variant( "vortex-turboquant", &tq_prep.array, &prepared.query, &baseline_scores, 
VerificationKind::Lossy, - &SESSION, ) .expect("TurboQuant verification should not error"); assert!( diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 70330e77f91..9811abc903d 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -549,4 +549,33 @@ mod tests { "unexpected error: {err}", ); } + + #[test] + fn nullable_element_dtype_is_rejected() { + // Build a `List` — a list whose elements have nullable dtype (even + // if every value happens to be present). The `Vector` extension type at the + // FSL level requires non-nullable elements, so this must be rejected. + // + // Passing `Validity::AllValid` to `PrimitiveArray::new` sets the ptype's + // nullability to `Nullable`, which is what triggers the rejection path even + // though every value is technically valid. + let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), + Validity::AllValid, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 3, 6]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) + .into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("element type must be non-nullable"), + "unexpected error: {err}" + ); + } }