From f8c94de3aa203fa3b64b8c1ee17e7b33e83ec2db Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sat, 11 Apr 2026 17:32:06 -0400 Subject: [PATCH 1/3] vector search benchmarks Co-authored-by: Claude Signed-off-by: Connor Tsui --- .github/workflows/bench-pr.yml | 2 + .github/workflows/bench.yml | 4 + Cargo.lock | 21 + Cargo.toml | 1 + benchmarks/datafusion-bench/src/lib.rs | 2 +- benchmarks/vector-search-bench/Cargo.toml | 36 + benchmarks/vector-search-bench/README.md | 114 +++ .../src/handrolled_baseline.rs | 401 +++++++++++ benchmarks/vector-search-bench/src/lib.rs | 663 ++++++++++++++++++ benchmarks/vector-search-bench/src/main.rs | 415 +++++++++++ benchmarks/vector-search-bench/src/recall.rs | 162 +++++ benchmarks/vector-search-bench/src/verify.rs | 402 +++++++++++ vortex-bench/Cargo.toml | 1 + vortex-bench/src/conversions.rs | 357 ++++++++++ vortex-bench/src/lib.rs | 10 + vortex-bench/src/measurements.rs | 6 +- vortex-bench/src/vector_dataset.rs | 241 +++++++ .../benches/similarity_search_common/mod.rs | 100 +-- vortex-tensor/public-api.lock | 8 + vortex-tensor/src/lib.rs | 2 + vortex-tensor/src/vector_search.rs | 298 ++++++++ 21 files changed, 3148 insertions(+), 98 deletions(-) create mode 100644 benchmarks/vector-search-bench/Cargo.toml create mode 100644 benchmarks/vector-search-bench/README.md create mode 100644 benchmarks/vector-search-bench/src/handrolled_baseline.rs create mode 100644 benchmarks/vector-search-bench/src/lib.rs create mode 100644 benchmarks/vector-search-bench/src/main.rs create mode 100644 benchmarks/vector-search-bench/src/recall.rs create mode 100644 benchmarks/vector-search-bench/src/verify.rs create mode 100644 vortex-bench/src/vector_dataset.rs create mode 100644 vortex-tensor/src/vector_search.rs diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index f392f1a6e45..0d16b7dd353 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -33,6 +33,8 @@ jobs: build_args: "--features 
lance" - id: compress-bench name: Compression + - id: vector-search-bench + name: Vector Similarity Search steps: - uses: runs-on/action@v2 if: github.event.pull_request.head.repo.fork == false diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7be829d80bd..f900022e2f6 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -48,6 +48,10 @@ jobs: name: Compression build_args: "--features lance" formats: "parquet,lance,vortex" + - id: vector-search-bench + name: Vector Similarity Search + build_args: "" + formats: "handrolled,vortex-uncompressed,vortex-default,vortex-turboquant" steps: - uses: runs-on/action@v2 if: github.repository == 'vortex-data/vortex' diff --git a/Cargo.lock b/Cargo.lock index 0b84a2f407a..aba600c1237 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10032,6 +10032,26 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vector-search-bench" +version = "0.1.0" +dependencies = [ + "anyhow", + "arrow-array 58.0.0", + "arrow-buffer 58.0.0", + "arrow-schema 58.0.0", + "clap", + "indicatif", + "parquet 58.0.0", + "tempfile", + "tokio", + "tracing", + "vortex", + "vortex-bench", + "vortex-btrblocks", + "vortex-tensor", +] + [[package]] name = "version_check" version = "0.9.5" @@ -10221,6 +10241,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-tensor", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index f486a90b3c7..e293311d222 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,7 @@ members = [ "benchmarks/datafusion-bench", "benchmarks/duckdb-bench", "benchmarks/random-access-bench", + "benchmarks/vector-search-bench", ] exclude = ["java/testfiles", "wasm-test"] resolver = "2" diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index 469c9be6177..8b4815e082c 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ 
b/benchmarks/datafusion-bench/src/lib.rs @@ -114,7 +114,7 @@ pub fn format_to_df_format(format: Format) -> Arc { Format::Csv => Arc::new(CsvFormat::default()) as _, Format::Arrow => Arc::new(ArrowFormat), Format::Parquet => Arc::new(ParquetFormat::new()), - Format::OnDiskVortex | Format::VortexCompact => { + Format::OnDiskVortex | Format::VortexCompact | Format::VortexLossy => { Arc::new(VortexFormat::new(SESSION.clone())) } Format::OnDiskDuckDB | Format::Lance => { diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml new file mode 100644 index 00000000000..bbc620572b5 --- /dev/null +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "vector-search-bench" +description = "Vector similarity search benchmarks for Vortex on public embedding datasets" +authors.workspace = true +categories.workspace = true +edition.workspace = true +homepage.workspace = true +include.workspace = true +keywords.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } +clap = { workspace = true, features = ["derive"] } +indicatif = { workspace = true } +parquet = { workspace = true } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +vortex = { workspace = true } +vortex-bench = { workspace = true } +vortex-btrblocks = { workspace = true } +vortex-tensor = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } + +[lints] +workspace = true diff --git a/benchmarks/vector-search-bench/README.md b/benchmarks/vector-search-bench/README.md new file mode 100644 index 00000000000..7acf1543f8f --- /dev/null +++ b/benchmarks/vector-search-bench/README.md @@ -0,0 +1,114 @@ +# vector-search-bench + 
+Brute-force cosine-similarity benchmark for Vortex on public VectorDBBench +embedding corpora. + +## What it measures + +For each `(dataset, format)` pair, the benchmark records: + +1. **`nbytes`** — in-memory footprint of the variant's array tree, in bytes. + Reporting the in-memory `.nbytes()` instead of an on-disk file size is + deliberate: the Vortex default write path runs BtrBlocks on every tree + regardless of whether it's already compressed, so "on-disk size" would + collapse `vortex-uncompressed` and `vortex-default` to the same bytes + even though their in-memory trees are different. The `nbytes()` + number is consistent with what the *compute* measurements actually + operate on. + - The `handrolled` baseline reports the canonical parquet file size + on disk — that's the only encoded representation it has. +2. **Compress time** — wall time to build the variant tree from the + materialized uncompressed source. ~0 for `vortex-uncompressed` (identity), + meaningful for the two compressed variants. +3. **Decompress time** — wall time to execute the variant tree all the way + back into a canonical `FixedSizeListArray` with a materialized f32 + element buffer. For `vortex-uncompressed` this is a no-op; for + `vortex-default` it includes ALP-RD bit-unpacking; for + `vortex-turboquant` it includes the inverse SORF rotation and + dictionary lookup. +4. **Cosine-similarity time** — `CosineSimilarity(data, const_query)` + executed to a materialized f32 array. +5. **Cosine-filter time** — `Binary(Gt, [CosineSimilarity, threshold])` + executed to a `BoolArray`. +6. **Recall@10** (TurboQuant only) — the fraction of the exact top-10 + nearest neighbours that TurboQuant recovers, using the uncompressed + Vortex scan as local ground truth. + +Before any timing starts, the benchmark runs a **correctness verification +pass**: cosine scores for a single query are computed against every +variant and compared to the uncompressed baseline. 
Lossless variants must +match within `1e-4` max-abs-diff; TurboQuant must stay within `0.2`. A +mismatch bails the run — you cannot publish throughput numbers for a +variant that returns wrong answers. + +## Formats + +- `handrolled` — Hand-rolled Rust scalar cosine loop over a flat + `Vec` that was decoded from the canonical parquet file via + `parquet-rs` / `arrow-rs`. The **decompress** phase does the parquet + read, downcasts to `Float32Array`, and memcpies into a plain `Vec`. + The **compute** phase is a plain scalar loop over `&[f32]` — no Arrow + compute kernels, no scalar-function dispatch, no SIMD annotations. + + This is a **compute-cost floor**, not a realistic parquet-on-DBMS + baseline. It answers the question "what's the minimum cost you could + get away with if you wrote a vector-search scan by hand with no query + engine?" Real parquet users would pay substantially more (DuckDB + `list_cosine_similarity`, DataFusion with a vector UDF, etc.) — + adding those as additional baselines is a natural v2 direction. +- `vortex-uncompressed` — Raw `Vector` extension array, no + encoding-level compression applied. +- `vortex-default` — `BtrBlocksCompressor::default()` applied to the FSL + storage child. On float vectors this typically finds ~15% lossless + savings via ALP-RD (mantissa/exponent split + bitpacking). +- `vortex-turboquant` — The full + `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` pipeline. + Lossy; recall@10 is reported alongside throughput. At the default 8-bit + config this typically gives ~3× storage reduction at >90% top-10 + recall. + +## Datasets + +The smallest built-in dataset is **Cohere-100K** (`cohere-small`): 100K +rows × 768 dims, cosine metric, ~150 MB zstd-parquet. It's the smallest +VectorDBBench-supplied corpus that still exercises every encoding path. 
+Larger variants (`cohere-medium`, `openai-small`, `openai-medium`, +`bioasq-medium`, `glove-medium`) are wired up for local / on-demand +experiments; see `vortex-bench/src/vector_dataset.rs` for the full list. + +The upstream URL for Cohere-100K is +`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`. +The public Zilliz bucket is anonymous-readable so the code can hit it +directly. + +## Running locally + +```bash +cargo run -p vector-search-bench --release -- \ + --datasets cohere-small \ + --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \ + --iterations 5 \ + -d table +``` + +The first run downloads the parquet file into +`vortex-bench/data/cohere-small/cohere-small.parquet` and caches it +idempotently for subsequent runs. + +## CI note: dataset mirror + +CI runs after every develop-branch merge. Hitting `assets.zilliz.com` +from every merge would create recurring egress traffic on a third-party +bucket — the same courtesy reason `RPlace` / `AirQuality` are excluded +from CI in `compress-bench`. + +Before enabling the `vector-search-bench` entry in `.github/workflows/bench.yml` +on a fork, either: + +1. **Mirror the file into an internal bucket** and swap the URL in + `vortex-bench/src/vector_dataset.rs::VectorDataset::parquet_url`, or +2. **Accept the upstream egress cost** and leave the URL as-is. + +The mirror step is a one-off `aws s3 cp` and is documented here rather +than automated in the build because the destination bucket is +organization-specific. diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs new file mode 100644 index 00000000000..f62c5198fb6 --- /dev/null +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Hand-rolled Rust cosine similarity baseline. +//! +//! 
This module provides the *compute-cost floor* the other Vortex variants are measured +//! against. It is **not** a realistic "parquet in a DBMS" baseline — it's the minimum +//! amount of work a Rust programmer could get away with if they wrote a vector-search +//! scan by hand with no query engine, no scalar-function dispatch, and no Arrow compute +//! kernels. +//! +//! Two distinct phases run per iteration, and the benchmark times them separately so the +//! dashboard can separate storage-read cost from compute cost: +//! +//! 1. **Decompress** ([`read_parquet_embedding_column`]) — reads the canonical parquet +//! file via `parquet-rs`, downcasts the `emb` column to an Arrow `Float32Array`, and +//! copies every value into a flat `Vec`. This phase is the only place Arrow is +//! actually used — only for the decode. The `memcpy` at the end is incidental: we +//! could operate directly on `Float32Array::values()` with identical performance, +//! but taking ownership of a `Vec` frees the Arrow `RecordBatch` lifetimes. +//! 2. **Compute** ([`cosine_loop`] and [`filter_loop`]) — runs a plain scalar Rust loop +//! over `&[f32]`. Arrow is no longer involved. There's no SIMD, no unrolling +//! annotations, no dispatch overhead, no output-array allocation beyond a single +//! `Vec`. This is deliberately "the fastest you could possibly make it go +//! without writing SIMD intrinsics". +//! +//! Calling this "the parquet baseline" would be misleading, because: +//! +//! - The compute layer has nothing to do with parquet — parquet is only the input +//! encoding, not the execution substrate. +//! - Real parquet-on-DBMS engines (DuckDB's `list_cosine_similarity`, DataFusion with a +//! vector UDF, etc.) would pay substantial dispatch / planner / row-iterator cost +//! that this loop skips entirely. +//! +//! Think of it as: "If you didn't have Vortex and didn't feel like reaching for a query +//! engine, what's the minimum scan cost you could get away with on this data?" 
That's +//! the question this module answers, and it's intentionally a lower bound rather than a +//! fair DBMS comparison. Future work could add DuckDB / DataFusion baselines alongside +//! this one for the DBMS-level comparison. + +use std::fs::File; +use std::path::Path; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use arrow_array::Array; +use arrow_array::FixedSizeListArray; +use arrow_array::Float32Array; +use arrow_array::ListArray; +use arrow_array::RecordBatch; +use arrow_array::cast::AsArray; +use arrow_schema::DataType; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + +use crate::VariantTimings; + +/// Read the entire `emb` column of a parquet file into a single flat `Vec`, along +/// with the dimension and row count. This is the *decompress* phase of the hand-rolled +/// baseline — it's the only place Arrow is actually used. `parquet-rs` does the file +/// decode, we downcast to `Float32Array`, and then memcpy into a plain `Vec` so +/// the compute loop can operate over a raw slice without holding any Arrow +/// `RecordBatch` references. +/// +/// Kept under its `parquet` name because this function *actually reads parquet*; only +/// the compute-side wrappers take the `handrolled` label. +pub fn read_parquet_embedding_column(parquet_path: &Path) -> Result { + let file = File::open(parquet_path) + .with_context(|| format!("open parquet file {}", parquet_path.display()))?; + let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + + // Locate the `emb` column and sanity-check its type. + let (_, emb_field) = builder + .schema() + .column_with_name("emb") + .context("parquet schema missing `emb` column")?; + + // VectorDBBench parquet files use `list`; some others use `fixed_size_list`. + // Both need to be supported — the canonical parquet emit from arrow-rs is `list` + // since parquet has no fixed-size-list logical type. 
+ let element_dtype = match emb_field.data_type() { + DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) => { + field.data_type().clone() + } + other => bail!("emb column must be a list of float, got {other:?}"), + }; + if !matches!(element_dtype, DataType::Float32) { + bail!( + "emb column element type must be Float32, got {:?}", + element_dtype + ); + } + + let reader = builder.build()?; + let batches: Vec = reader.collect::, _>>()?; + + let mut data = Vec::::new(); + let mut num_rows = 0usize; + let mut inferred_dim: Option = None; + + for batch in batches.iter() { + let column = batch + .column_by_name("emb") + .context("emb column missing from record batch")?; + append_batch(column, &mut data, &mut inferred_dim, &mut num_rows)?; + } + + let dim = inferred_dim.context("parquet file has zero rows — cannot infer dimension")?; + Ok(HandrolledBaselineData { + elements: data, + dim, + num_rows, + }) +} + +fn append_batch( + column: &dyn Array, + data: &mut Vec, + inferred_dim: &mut Option, + num_rows: &mut usize, +) -> Result<()> { + if let Some(fsl) = column.as_any().downcast_ref::() { + let dim = fsl.value_length() as usize; + maybe_set_dim(inferred_dim, dim)?; + let values = fsl + .values() + .as_any() + .downcast_ref::() + .context("FSL emb column must have Float32 values")?; + data.extend_from_slice(values.values()); + *num_rows += fsl.len(); + return Ok(()); + } + + if let Some(list) = column.as_any().downcast_ref::() { + let values: &Float32Array = list + .values() + .as_primitive_opt::() + .context("List emb column must have Float32 values")?; + let offsets = list.value_offsets(); + for i in 0..list.len() { + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + let row_len = end - start; + maybe_set_dim(inferred_dim, row_len)?; + data.extend_from_slice(&values.values()[start..end]); + *num_rows += 1; + } + return Ok(()); + } + + bail!( + "emb column has unsupported arrow type {:?}", + column.data_type() 
+ ); +} + +fn maybe_set_dim(inferred_dim: &mut Option, new_dim: usize) -> Result<()> { + match inferred_dim { + Some(d) if *d == new_dim => Ok(()), + Some(d) => bail!("inconsistent emb dimensions: saw {d} then {new_dim}"), + None if new_dim == 0 => bail!("emb row has zero elements"), + None => { + *inferred_dim = Some(new_dim); + Ok(()) + } + } +} + +/// The flattened representation of an embedding column, suitable for a hand-rolled +/// distance loop. Intentionally decoupled from any format — the compute side doesn't +/// care how the data got into this `Vec`. +/// +/// The benchmark's "size" measurement for the handrolled baseline comes from +/// [`crate::PreparedDataset::parquet_bytes`] (which is populated once in +/// [`crate::prepare_dataset`]), not from this struct. We deliberately don't carry +/// the file size in here — doing so would duplicate state between two places that +/// can go out of sync. +pub struct HandrolledBaselineData { + /// All rows concatenated: `elements.len() == num_rows * dim`. + pub elements: Vec, + /// Vector dimensionality. + pub dim: usize, + /// Number of rows. + pub num_rows: usize, +} + +/// Result of running the hand-rolled baseline timing loop. +/// +/// Carries both the best-of-N timing numbers **and** the cosine scores from the final +/// iteration. The scores are exposed so the caller can feed them into +/// [`crate::verify::verify_and_report_scores`] for the correctness check without +/// re-reading the parquet file. Because `cosine_loop` is deterministic, the scores +/// from any iteration equal the scores from every other iteration; using the last +/// one is simply the most convenient snapshot. +pub struct HandrolledBaselineResult { + /// Best-of-N wall times for decompress / cosine / filter. + pub timings: VariantTimings, + /// Cosine-similarity scores from the final iteration. Length equals the dataset + /// row count. 
+ pub last_scores: Vec, +} + +/// Run the decompress / cosine / filter microbenchmarks for the hand-rolled baseline +/// and return the best-of-N wall times along with the last iteration's cosine scores. +/// +/// The decompress phase re-reads the parquet file from disk on each iteration (matches +/// how the Vortex variants re-execute their tree from scratch each iteration), and the +/// compute phase runs [`cosine_loop`] and [`filter_loop`] over the flat `Vec` the +/// decompress phase produced. Returning the last iteration's scores lets the caller +/// perform correctness verification against the Vortex baseline without a redundant +/// parquet read. +/// +/// # Panics +/// +/// Panics if `iterations == 0`. The benchmark CLI defaults to 5 and the lowest +/// meaningful value is 1 (single-shot best-of-1). +pub fn run_handrolled_baseline_timings( + parquet_path: &Path, + query: &[f32], + threshold: f32, + iterations: usize, +) -> Result { + assert!( + iterations > 0, + "run_handrolled_baseline_timings requires iterations >= 1" + ); + + let mut decompress = Duration::MAX; + let mut cosine = Duration::MAX; + let mut filter = Duration::MAX; + let mut last_scores: Vec = Vec::new(); + + for _ in 0..iterations { + let start = Instant::now(); + let data = read_parquet_embedding_column(parquet_path)?; + decompress = decompress.min(start.elapsed()); + + let start = Instant::now(); + let scores = cosine_loop(&data.elements, data.num_rows, data.dim, query); + cosine = cosine.min(start.elapsed()); + debug_assert_eq!(scores.len(), data.num_rows); + + let start = Instant::now(); + let matches = filter_loop(&scores, threshold); + filter = filter.min(start.elapsed()); + debug_assert_eq!(matches.len(), data.num_rows); + + last_scores = scores; + } + + Ok(HandrolledBaselineResult { + timings: VariantTimings { + decompress, + cosine, + filter, + }, + last_scores, + }) +} + +/// Compute cosine similarity for every row against `query`. 
The query is assumed to match +/// the database vectors' dimension. Returns one f32 score per row; scores for zero-norm +/// rows or a zero-norm query are 0.0 by convention. +pub fn cosine_loop(elements: &[f32], num_rows: usize, dim: usize, query: &[f32]) -> Vec { + assert_eq!(query.len(), dim); + assert_eq!(elements.len(), num_rows * dim); + + let query_norm = query.iter().map(|&q| q * q).sum::().sqrt(); + let mut out = Vec::with_capacity(num_rows); + if query_norm == 0.0 { + out.resize(num_rows, 0.0); + return out; + } + + for row in 0..num_rows { + let base = row * dim; + let slice = &elements[base..base + dim]; + let mut dot = 0.0f32; + let mut sq = 0.0f32; + for i in 0..dim { + dot += slice[i] * query[i]; + sq += slice[i] * slice[i]; + } + let norm = sq.sqrt(); + if norm == 0.0 { + out.push(0.0); + } else { + out.push(dot / (norm * query_norm)); + } + } + out +} + +/// Build the `cosine > threshold` boolean mask — **strict greater-than**, matching the +/// Vortex-side path which uses `Operator::Gt` in +/// [`vortex_tensor::vector_search::build_similarity_search_tree`]. Keep these two in +/// sync: if one changes the comparison semantics, the correctness-verification pass will +/// start reporting a mismatch for the lossless variants. +pub fn filter_loop(scores: &[f32], threshold: f32) -> Vec { + scores.iter().map(|&s| s > threshold).collect() +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::sync::Arc; + + use arrow_array::RecordBatch; + use arrow_array::builder::FixedSizeListBuilder; + use arrow_array::builder::Float32Builder; + use arrow_schema::DataType; + use arrow_schema::Field; + use arrow_schema::Schema; + use parquet::arrow::ArrowWriter; + use tempfile::NamedTempFile; + + use super::*; + + /// Build a minimal parquet file with an `emb: FixedSizeList` column and + /// verify the baseline pipeline produces the expected scores. 
+ fn write_tiny_fsl_parquet(dim: i32, rows: &[&[f32]]) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new( + "emb", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), dim), + false, + )])); + + let file = NamedTempFile::new()?; + let mut writer = + ArrowWriter::try_new(File::create(file.path())?, Arc::clone(&schema), None)?; + + let dim_usize = usize::try_from(dim).unwrap(); + let mut builder = FixedSizeListBuilder::new(Float32Builder::new(), dim); + for row in rows { + assert_eq!(row.len(), dim_usize); + for &v in row.iter() { + builder.values().append_value(v); + } + builder.append(true); + } + let array = builder.finish(); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)])?; + writer.write(&batch)?; + writer.close()?; + Ok(file) + } + + #[test] + fn handrolled_baseline_reads_fsl_column() { + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + + let data = read_parquet_embedding_column(file.path()).unwrap(); + assert_eq!(data.dim, 3); + assert_eq!(data.num_rows, 3); + assert_eq!(data.elements.len(), 9); + + let query = [1.0f32, 0.0, 0.0]; + let scores = cosine_loop(&data.elements, data.num_rows, data.dim, &query); + assert_eq!(scores, vec![1.0, 0.0, 1.0]); + + let mask = filter_loop(&scores, 0.5); + assert_eq!(mask, vec![true, false, true]); + } + + #[test] + fn run_handrolled_baseline_timings_returns_last_iteration_scores() { + // Verifies the new `last_scores` contract: the timing loop returns the + // cosine scores from the final iteration, and those scores match what we'd + // get from a one-shot `cosine_loop` on the same data. Callers of + // `run_handrolled_baseline_timings` rely on this for verification (so they + // don't need a second parquet read to compute ground-truth scores). 
+ let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + let query = [1.0f32, 0.0, 0.0]; + + let result = run_handrolled_baseline_timings(file.path(), &query, 0.5, 3).unwrap(); + + // Deterministic expected scores: rows 0 and 2 match the query exactly, + // row 1 is orthogonal. + assert_eq!(result.last_scores, vec![1.0, 0.0, 1.0]); + assert!(result.timings.decompress > Duration::ZERO); + assert!(result.timings.cosine > Duration::ZERO); + assert!(result.timings.filter > Duration::ZERO); + } + + #[test] + #[should_panic(expected = "iterations >= 1")] + fn run_handrolled_baseline_timings_panics_on_zero_iterations() { + let file = + write_tiny_fsl_parquet(3, &[&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0], &[1.0, 0.0, 0.0]]) + .unwrap(); + let query = [1.0f32, 0.0, 0.0]; + let _result = run_handrolled_baseline_timings(file.path(), &query, 0.5, 0); + } +} diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs new file mode 100644 index 00000000000..4ae3fce26e2 --- /dev/null +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -0,0 +1,663 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vector similarity-search benchmark core. +//! +//! For each `(dataset, variant)` pair we report: +//! +//! - **In-memory size** — `ArrayRef::nbytes()` of the prepared variant tree. This is the +//! memory footprint you'd pay to keep that encoding resident. +//! - **Compress time** — the wall time to build the variant tree from the materialized +//! uncompressed source (0 for the uncompressed variant itself, the BtrBlocks pass for +//! `vortex-default`, the full L2Denorm+SORF+quantize pipeline for `vortex-turboquant`). +//! - **Decompress time** — the wall time to execute the variant tree back into a +//! canonical `FixedSizeListArray` (≈0 for the already-canonical uncompressed variant, +//! meaningful for the compressed variants). +//! 
- **Cosine time** — executing `CosineSimilarity(data, const_query)` to a materialized +//! f32 primitive array. +//! - **Filter time** — executing `Binary(Gt, [cosine, threshold])` to a `BoolArray`. +//! - **Recall@10** (for the lossy TurboQuant variant only) against exact top-10 from the +//! uncompressed variant. +//! +//! Before any timing begins, the benchmark also runs a **correctness verification** pass +//! via [`verify`]: for every variant it computes cosine scores for a single query and +//! compares them to the ground-truth scores from the uncompressed variant. Lossless +//! variants must match within [`verify::LOSSLESS_TOLERANCE`]; lossy variants must match +//! within [`verify::LOSSY_TOLERANCE`]. A correctness failure bails the run. +//! +//! Measurements are emitted via the existing `vortex_bench::measurements` types so +//! results flow through the standard `gh-json` pipeline and show up on the CI dashboard +//! alongside compress-bench / random-access-bench. + +use std::time::Duration; +use std::time::Instant; + +pub mod handrolled_baseline; +pub mod recall; +pub mod verify; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use clap::ValueEnum; +use vortex::array::ArrayRef; +use vortex::array::ExecutionCtx; +use vortex::array::IntoArray; +use vortex::array::VortexSessionExecute; +use vortex::array::arrays::BoolArray; +use vortex::array::arrays::Chunked; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrays::Extension; +use vortex::array::arrays::ExtensionArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::Struct; +use vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::array::arrays::extension::ExtensionArrayExt; +use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex::array::arrays::struct_::StructArrayExt as _; +use vortex::dtype::DType; +use vortex::dtype::PType; +use vortex::error::vortex_panic; +use 
vortex_bench::Format; +use vortex_bench::SESSION; +use vortex_bench::conversions::list_to_vector_ext; +use vortex_bench::conversions::parquet_to_vortex_chunks; +use vortex_bench::datasets::Dataset; +use vortex_bench::vector_dataset::VectorDataset; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; +use vortex_tensor::vector_search::build_constant_query_vector; +use vortex_tensor::vector_search::build_similarity_search_tree; +use vortex_tensor::vector_search::compress_turboquant; + +/// The threshold used when wrapping the similarity expression in a +/// `Binary(Gt, [cosine, threshold])` filter. Set to a value high enough that random pairs +/// from a ~1.0-norm distribution reject but self-query pairs match. +pub const DEFAULT_THRESHOLD: f32 = 0.8; + +/// Row index used to pick a query vector from the dataset. Using a fixed row keeps queries +/// reproducible across runs and guarantees at least one match (since `cosine(x, x) == 1.0`). +pub const DEFAULT_QUERY_ROW: usize = 0; + +/// A single data-preparation strategy that the benchmark exercises. +/// +/// Each variant corresponds to one column on the "format" axis in downstream dashboards. The +/// `Format` mapping is what gets serialized into the `target.format` field of gh-json +/// output. +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +pub enum Variant { + /// Raw `Vector` with no encoding-level compression applied. + #[clap(name = "vortex-uncompressed")] + VortexUncompressed, + /// `BtrBlocksCompressor::default()` walks into the `Vector` extension and recursively + /// compresses the FSL storage child. This is the "generic lossless" Vortex story for + /// float vectors. + #[clap(name = "vortex-default")] + VortexDefault, + /// The full TurboQuant pipeline: `L2Denorm(SorfTransform(FSL(Dict)))`. Lossy; dramatic + /// size win; requires reporting recall alongside throughput for the comparison to be + /// honest. 
See [`vortex_tensor::vector_search::compress_turboquant`]. + #[clap(name = "vortex-turboquant")] + VortexTurboQuant, +} + +impl Variant { + /// The Format enum value this variant reports itself as in emitted measurements. + /// Uncompressed and BtrBlocks-default both surface as [`Format::OnDiskVortex`]; the + /// TurboQuant variant surfaces as [`Format::VortexLossy`] — the general + /// file-format bucket for any `.vortex` file that contains lossy encodings — + /// so dashboards can distinguish lossy runs from the lossless baseline. + pub fn as_format(&self) -> Format { + match self { + Variant::VortexUncompressed => Format::OnDiskVortex, + Variant::VortexDefault => Format::OnDiskVortex, + Variant::VortexTurboQuant => Format::VortexLossy, + } + } + + /// A stable, kebab-cased label used in metric names so dashboards can split apart + /// variants that map to the same Format. + pub fn label(&self) -> &'static str { + match self { + Variant::VortexUncompressed => "vortex-uncompressed", + Variant::VortexDefault => "vortex-default", + Variant::VortexTurboQuant => "vortex-turboquant", + } + } +} + +/// The ingested form of a dataset, ready to be fed to [`prepare_variant`] and the +/// timing/verification pipeline. +pub struct PreparedDataset { + /// Name used in metric strings — usually the dataset's `Dataset::name()`. + pub name: String, + /// Uncompressed `Vector` array (canonical form). Doubles as the + /// ground-truth basis for the correctness-verification pass and for TurboQuant's + /// Recall@K quality measurement. + pub uncompressed: ArrayRef, + /// The query vector to use (a single row pulled from the dataset). + pub query: Vec, + /// Parquet file size on disk in bytes — produced by the dataset download step + /// and reused as the "handrolled size" measurement in main.rs. + pub parquet_bytes: u64, +} + +impl PreparedDataset { + /// Dimension of the underlying vector column. 
+ /// + /// # Panics + /// + /// Panics if `self.uncompressed` is not an `Extension>` — + /// which should be impossible because [`prepare_dataset`] is the only constructor + /// and it guarantees this shape. + pub fn dim(&self) -> u32 { + let fsl_dtype = match self.uncompressed.dtype() { + DType::Extension(ext) => ext.storage_dtype(), + other => vortex_panic!("expected Extension, got {other}"), + }; + match fsl_dtype { + DType::FixedSizeList(_, dim, _) => *dim, + other => vortex_panic!("expected FixedSizeList storage, got {other}"), + } + } + + /// Number of rows in the uncompressed dataset. + pub fn num_rows(&self) -> usize { + self.uncompressed.len() + } +} + +/// Prepare a dataset by downloading its parquet file, converting the `emb` column to a +/// `Vector` extension array, and extracting a single-row query vector. +pub async fn prepare_dataset(dataset: &VectorDataset) -> Result { + let parquet_path = dataset + .to_parquet_path() + .await + .context("download vector dataset parquet")?; + let parquet_bytes = std::fs::metadata(&parquet_path) + .with_context(|| format!("stat parquet file {:?}", parquet_path))? + .len(); + + tracing::info!( + "ingesting {} parquet from {:?} ({} bytes)", + dataset.name(), + parquet_path, + parquet_bytes + ); + + let chunked = parquet_to_vortex_chunks(parquet_path).await?; + + let struct_array = chunked.into_array(); + let emb_column = extract_emb_column(&struct_array)?; + let wrapped = list_to_vector_ext(emb_column)?; + + // `list_to_vector_ext` may return a chunked `Extension` when the source was + // a `ChunkedArray` of list columns (the usual shape after `parquet_to_vortex_chunks`). + // Materialize it into a single non-chunked `ExtensionArray` so downstream code can + // treat it uniformly. 
+ let mut ctx = SESSION.create_execution_ctx(); + let uncompressed = wrapped.execute::(&mut ctx)?.into_array(); + + let query = extract_query_row(&uncompressed, DEFAULT_QUERY_ROW)?; + + Ok(PreparedDataset { + name: dataset.name().to_string(), + uncompressed, + query, + parquet_bytes, + }) +} + +/// Project the `emb` column out of a chunked struct array. This rebuilds a chunked list +/// array with just that one column. +fn extract_emb_column(struct_array: &ArrayRef) -> Result { + if let Some(chunked) = struct_array.as_opt::() { + let mut emb_chunks: Vec = Vec::with_capacity(chunked.nchunks()); + for chunk in chunked.iter_chunks() { + emb_chunks.push(extract_emb_column(chunk)?); + } + if emb_chunks.is_empty() { + bail!("dataset has no chunks"); + } + return Ok(ChunkedArray::from_iter(emb_chunks).into_array()); + } + + let Some(struct_view) = struct_array.as_opt::() else { + bail!( + "expected dataset chunks to be Struct arrays, got {}", + struct_array.dtype() + ); + }; + + let field = struct_view + .unmasked_field_by_name("emb") + .context("dataset parquet must have an `emb` column")?; + Ok(field.clone()) +} + +/// Pull a single row out of a `Vector` extension array as a plain `Vec`. +/// +/// Only `f32`-typed `Vector` arrays are supported today — the benchmark deliberately +/// restricts itself to `f32` vectors, so we assert the element type rather than +/// quietly returning a mis-cast slice. +pub(crate) fn extract_query_row(vector_ext: &ArrayRef, row: usize) -> Result> { + if row >= vector_ext.len() { + bail!( + "query row {row} out of bounds for dataset of length {}", + vector_ext.len() + ); + } + + let ext_view = vector_ext + .as_opt::() + .context("prepared dataset must be a Vector extension array")?; + + let mut ctx = SESSION.create_execution_ctx(); + + // Execute storage array to its canonical FSL form. 
+ let fsl: FixedSizeListArray = ext_view.storage_array().clone().execute(&mut ctx)?; + + let dim_usize = match fsl.dtype() { + DType::FixedSizeList(_, d, _) => *d as usize, + other => bail!("storage dtype must be FixedSizeList, got {other}"), + }; + + let elements: PrimitiveArray = fsl.elements().clone().execute(&mut ctx)?; + if elements.ptype() != PType::F32 { + bail!( + "extract_query_row currently only supports f32 Vector columns, got {:?}", + elements.ptype() + ); + } + let slice = elements.as_slice::(); + let start = row * dim_usize; + Ok(slice[start..start + dim_usize].to_vec()) +} + +/// A prepared variant: the in-memory array tree plus the metadata we want to report +/// alongside it (size and construction cost). +#[derive(Debug, Clone)] +pub struct PreparedVariant { + /// The variant's in-memory array tree. For the uncompressed variant this is the same + /// canonical `Extension` pulled out of `prepare_dataset`; for the others it's + /// the output of the respective compression pipeline. + pub array: ArrayRef, + /// Summed byte footprint of the variant tree — `ArrayRef::nbytes()`. This is the + /// in-memory cost of keeping the variant resident, not a disk size. + pub nbytes: u64, + /// Wall time spent constructing the variant tree from the already-materialized + /// uncompressed source. 0 for [`Variant::VortexUncompressed`]; meaningful for the + /// compressed variants. + pub compress_duration: Duration, +} + +/// Apply a `Variant`'s preparation strategy to the materialized uncompressed source and +/// return the resulting tree together with its reported in-memory size and construction +/// time. Uses the global [`vortex_bench::SESSION`] for any execution-context work; the +/// benchmark has no reason to support multiple concurrent sessions. 
+/// +/// **Why nbytes instead of on-disk size?** The Vortex file writer applies BtrBlocks +/// compression as part of its default write strategy regardless of the in-memory tree +/// shape, so serializing an "uncompressed" tree and measuring the resulting `.vortex` +/// file produces the same bytes as serializing a `BtrBlocksCompressor::default()`- +/// compressed tree — the disk-size comparison collapses two conceptually different +/// things into one number. Reporting `nbytes()` of the in-memory tree keeps the size +/// measurement consistent with what the *compute* measurements operate on. +/// +/// **Forward migration to disk-backed runs.** Today this benchmark keeps each +/// variant in memory because the TurboQuant pipeline's `L2Denorm` and +/// `SorfTransform` scalar functions do not yet implement `ScalarFnVTable::serialize`, +/// so writing a TurboQuant variant to a `.vortex` file and reading it back is not +/// round-trippable. Once those serialize impls land, this benchmark can switch to +/// disk-backed runs additively — there is no lossy-variant-specific code to unwind +/// here, because the `Format::VortexLossy` bucket was deliberately kept generic. +/// Variants continue to report their `Format` via `as_format()` exactly as they do +/// today; only the path from `PreparedVariant` to execution gains an optional +/// write/read hop. +pub fn prepare_variant(prepared: &PreparedDataset, variant: Variant) -> Result { + match variant { + Variant::VortexUncompressed => { + // Identity: the uncompressed Extension is already materialized. Still + // record a dummy Instant so the timing point has a well-defined value even + // if it's effectively zero. 
+ let start = Instant::now(); + let array = prepared.uncompressed.clone(); + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) + } + Variant::VortexDefault => { + let start = Instant::now(); + let array = BtrBlocksCompressor::default().compress(&prepared.uncompressed)?; + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) + } + Variant::VortexTurboQuant => { + let mut ctx = SESSION.create_execution_ctx(); + let start = Instant::now(); + let array = compress_turboquant(prepared.uncompressed.clone(), &mut ctx)?; + let compress_duration = start.elapsed(); + let nbytes = array.nbytes(); + Ok(PreparedVariant { + array, + nbytes, + compress_duration, + }) + } + } +} + +/// Run the decompress / cosine / filter microbenchmarks against a prepared variant +/// array and return the best-of-`iterations` wall times for each measurement. +/// +/// The three stages are **interleaved** inside a single outer loop rather than run +/// as three separate back-to-back loops. Interleaving keeps each stage's cache / +/// branch-predictor / allocator state symmetric across iterations — a pathology of +/// the back-to-back shape is that iteration `N+1` of the cosine stage runs on +/// warmed caches left behind by iteration `N` of the cosine stage, while iteration +/// `N+1` of the filter stage runs on caches left behind by the *cosine* stage. The +/// interleaved form makes each stage see roughly the same cache state every +/// iteration. +/// +/// Each stage still gets a fresh `ExecutionCtx` (from the global +/// [`vortex_bench::SESSION`]), so no cached scalar-fn state leaks between stages +/// within a single iteration. 
+pub fn run_timings( + variant_array: &ArrayRef, + query: &[f32], + iterations: usize, +) -> Result { + let mut decompress = Duration::MAX; + let mut cosine = Duration::MAX; + let mut filter = Duration::MAX; + + for _ in 0..iterations { + { + let mut ctx = SESSION.create_execution_ctx(); + let start = Instant::now(); + let decoded: FixedSizeListArray = decompress_full_scan(variant_array, &mut ctx)?; + decompress = decompress.min(start.elapsed()); + drop(decoded); + } + { + let mut ctx = SESSION.create_execution_ctx(); + let start = Instant::now(); + let scores: PrimitiveArray = execute_cosine(variant_array, query, &mut ctx)?; + cosine = cosine.min(start.elapsed()); + drop(scores); + } + { + let mut ctx = SESSION.create_execution_ctx(); + let start = Instant::now(); + let matches: BoolArray = + execute_filter(variant_array, query, DEFAULT_THRESHOLD, &mut ctx)?; + filter = filter.min(start.elapsed()); + drop(matches); + } + } + + Ok(VariantTimings { + decompress, + cosine, + filter, + }) +} + +/// Timing summary for one `(dataset, variant)` pair. +#[derive(Debug, Clone, Copy)] +pub struct VariantTimings { + /// Wall time to execute the variant's array tree back into a canonical + /// `FixedSizeListArray`. ~0 for [`Variant::VortexUncompressed`] (the tree is already + /// canonical), meaningful for the two compressed variants. + pub decompress: Duration, + /// Wall time for the cosine_similarity scalar-function execution over the whole + /// column (materialized into an `f32` [`PrimitiveArray`]). + pub cosine: Duration, + /// Wall time for the full `Binary(Gt, [cosine, threshold])` expression executed + /// into a [`BoolArray`]. + pub filter: Duration, +} + +/// Fully materialize the input column so the measurement captures *all* decompression +/// work — the extension shell, the FSL storage, **and the inner f32 element buffer**. +/// +/// Forcing the element buffer to materialize as a canonical `PrimitiveArray` is +/// what distinguishes this from a no-op cache hit. 
Executing the `ExtensionArray` or +/// `FixedSizeListArray` alone only unwraps the container shells — if the FSL's +/// `elements` child is (e.g.) an `alprd` tree, the bit-unpacking is lazy and only +/// happens when something reads the values. The `execute::` call below +/// forces that read. +/// +/// For the Vortex-uncompressed variant this is cheap (bitwise copy / no-op). For +/// BtrBlocks-default it includes the ALP-RD decoding pass. For TurboQuant it includes +/// running the inverse SORF rotation + dictionary lookup through the scalar-fn +/// pipeline. +pub fn decompress_full_scan( + array: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> Result { + let ext: ExtensionArray = array.clone().execute(ctx)?; + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; + // Force the element buffer all the way down to a canonical PrimitiveArray so the + // timing captures any lazy decode work (ALP-RD bit unpacking, dict lookups, SORF + // inverse rotation, etc.). + let elements: PrimitiveArray = fsl.elements().clone().execute(ctx)?; + drop(elements); + Ok(fsl) +} + +/// Execute `CosineSimilarity(data, broadcast(query))` to a materialized `f32` +/// [`PrimitiveArray`]. Shared between the timing loop and the correctness-verification +/// path so both exercise the exact same expression tree. +/// +/// # Errors +/// +/// Returns an error if `data` is not a [`vortex_tensor::vector::Vector`] extension array, +/// if `query`'s length doesn't match the database vector dimension, or if the execution +/// context rejects the expression. +pub fn execute_cosine( + data: &ArrayRef, + query: &[f32], + ctx: &mut ExecutionCtx, +) -> Result { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + let cosine = CosineSimilarity::try_new_array(data.clone(), query_vec, num_rows)?.into_array(); + Ok(cosine.execute(ctx)?) 
+} + +fn execute_filter( + data: &ArrayRef, + query: &[f32], + threshold: f32, + ctx: &mut ExecutionCtx, +) -> Result { + let tree = build_similarity_search_tree(data.clone(), query, threshold)?; + Ok(tree.execute(ctx)?) +} + +/// Test-only helpers shared between the unit tests in this crate's submodules. +#[cfg(test)] +pub(crate) mod test_utils { + use vortex::array::ArrayRef; + use vortex::array::IntoArray; + use vortex::array::arrays::ExtensionArray; + use vortex::array::arrays::FixedSizeListArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::extension::EmptyMetadata; + use vortex::array::validity::Validity; + use vortex::buffer::BufferMut; + use vortex::dtype::extension::ExtDType; + use vortex_tensor::vector::Vector; + + /// Build a deterministic `Vector` extension array of `num_rows` rows for + /// tests. The PRNG is a trivial xorshift keyed by `seed`; we don't care about the + /// distribution beyond "not all zeros". + pub fn synthetic_vector(dim: u32, num_rows: usize, seed: u64) -> ArrayRef { + let mut buf = BufferMut::::with_capacity(num_rows * dim as usize); + let mut state = seed; + for _ in 0..(num_rows * dim as usize) { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + let v = ((state & 0xFFFF) as f32 / 32768.0) - 1.0; + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable).into_array(); + let fsl = + FixedSizeListArray::try_new(elements, dim, Validity::NonNullable, num_rows).unwrap(); + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone()) + .unwrap() + .erased(); + ExtensionArray::new(ext_dtype, fsl.into_array()).into_array() + } +} + +#[cfg(test)] +mod tests { + use super::test_utils::synthetic_vector; + use super::*; + + /// Build a test `PreparedDataset` from synthetic data, pulling the query from + /// row 0 via the shared `extract_query_row` helper so all tests exercise the + /// ptype-assertion path the benchmark hot path uses. 
+ fn test_prepared(dim: u32, num_rows: usize, seed: u64) -> PreparedDataset { + let uncompressed = synthetic_vector(dim, num_rows, seed); + let query = extract_query_row(&uncompressed, 0).unwrap(); + PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query, + parquet_bytes: 0, + } + } + + #[test] + fn extract_query_row_returns_the_right_slice() { + let dim = 8u32; + let num_rows = 4usize; + let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); + + // Row 0 extraction was already used to populate `prepared.query`; check it + // agrees with a second extraction for row 0, and that row 3 (last) is + // different (as it should be for distinct synthetic vectors). + let row0 = extract_query_row(&prepared.uncompressed, 0).unwrap(); + let row3 = extract_query_row(&prepared.uncompressed, 3).unwrap(); + assert_eq!(row0, prepared.query); + assert_eq!(row0.len(), dim as usize); + assert_eq!(row3.len(), dim as usize); + assert_ne!(row0, row3, "different rows must differ for this seed"); + } + + #[test] + fn extract_query_row_rejects_out_of_bounds_row() { + let dim = 8u32; + let num_rows = 4usize; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); + + let err = extract_query_row(&prepared.uncompressed, 4) + .unwrap_err() + .to_string(); + assert!( + err.contains("query row 4 out of bounds"), + "unexpected error: {err}" + ); + } + + #[test] + fn prepare_variant_produces_non_empty_array_for_all_variants() { + let dim = 128u32; + let num_rows = 64usize; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); + + for variant in [ + Variant::VortexUncompressed, + Variant::VortexDefault, + Variant::VortexTurboQuant, + ] { + let prep = prepare_variant(&prepared, variant).unwrap(); + assert_eq!( + prep.array.len(), + num_rows, + "variant {variant:?} changed row count" + ); + assert!(prep.nbytes > 0, "variant {variant:?} reported zero size"); + + let timings = run_timings(&prep.array, &prepared.query, 2).unwrap(); + // TurboQuant + default must do real work; 
uncompressed's decompress is a + // no-op and can plausibly time as zero. + assert!(timings.cosine > Duration::ZERO); + assert!(timings.filter > Duration::ZERO); + } + } + + /// The **uncompressed** variant's decompress pass must be a no-op (the tree is + /// already canonical), while TurboQuant must do real work. This is a regression + /// guard for a future change accidentally making the uncompressed variant take the + /// slow path. + #[test] + fn uncompressed_decompress_is_fast() { + let dim = 128u32; + let num_rows = 256usize; + let prepared = test_prepared(dim, num_rows, 0xDEADBEEF); + + let uncompressed_prep = prepare_variant(&prepared, Variant::VortexUncompressed).unwrap(); + let turboquant_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); + + let unc_timings = run_timings(&uncompressed_prep.array, &prepared.query, 3).unwrap(); + let tq_timings = run_timings(&turboquant_prep.array, &prepared.query, 3).unwrap(); + + // The uncompressed decompress should be at least an order of magnitude faster + // than TurboQuant's (usually many orders of magnitude). 5x is a loose lower + // bound that won't flake on a noisy CI runner. + assert!( + tq_timings.decompress > unc_timings.decompress * 5, + "expected TurboQuant decompress ({:?}) to be >5x uncompressed ({:?})", + tq_timings.decompress, + unc_timings.decompress + ); + } + + /// Diagnostic: print the in-memory tree shape for each variant so we can see + /// exactly what BtrBlocks and TurboQuant do to the FSL storage. 
+ /// + /// Run with: + /// ```bash + /// cargo test -p vector-search-bench --release -- \ + /// --ignored --nocapture print_variant_trees + /// ``` + #[test] + #[ignore] + #[expect(clippy::use_debug, reason = "human-readable diagnostic output")] + fn print_variant_trees() { + let dim = 768u32; + let num_rows = 500usize; + let prepared = test_prepared(dim, num_rows, 0xC0FFEE); + + for variant in [ + Variant::VortexUncompressed, + Variant::VortexDefault, + Variant::VortexTurboQuant, + ] { + let prep = prepare_variant(&prepared, variant).unwrap(); + println!("=== {variant:?} ==="); + println!(" len : {}", prep.array.len()); + println!(" nbytes : {}", prep.nbytes); + println!(" compress_duration: {:?}", prep.compress_duration); + println!( + " encoding tree : {}", + prep.array.display_tree_encodings_only() + ); + } + } +} diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs new file mode 100644 index 00000000000..5e78563ef22 --- /dev/null +++ b/benchmarks/vector-search-bench/src/main.rs @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `vector-search-bench` — brute-force cosine-similarity benchmark over public VectorDBBench +//! embedding corpora. +//! +//! Usage: +//! +//! ```bash +//! cargo run -p vector-search-bench --release -- \ +//! --datasets cohere-small \ +//! --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \ +//! --iterations 5 \ +//! -d table +//! ``` +//! +//! The `handrolled` variant is a hand-rolled Rust scalar cosine loop over a flat +//! `Vec` decoded from the dataset's canonical parquet file; it is a compute-cost +//! floor, not a realistic parquet-on-DBMS baseline. See +//! [`handrolled_baseline`](vector_search_bench::handrolled_baseline) for details. 
+ +use std::borrow::Cow; +use std::io::Write; +use std::path::PathBuf; + +use anyhow::Context; +use anyhow::Result; +use clap::Parser; +use indicatif::ProgressBar; +use vector_search_bench::DEFAULT_THRESHOLD; +use vector_search_bench::Variant; +use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; +use vector_search_bench::prepare_dataset; +use vector_search_bench::prepare_variant; +use vector_search_bench::recall::DEFAULT_TOP_K; +use vector_search_bench::recall::measure_recall_at_k; +use vector_search_bench::run_timings; +use vector_search_bench::verify::VerificationKind; +use vector_search_bench::verify::compute_cosine_scores; +use vector_search_bench::verify::verify_and_report_scores; +use vector_search_bench::verify::verify_variant; +use vortex_bench::Format; +use vortex_bench::create_output_writer; +use vortex_bench::datasets::Dataset; +use vortex_bench::display::DisplayFormat; +use vortex_bench::display::print_measurements_json; +use vortex_bench::measurements::CompressionTimingMeasurement; +use vortex_bench::measurements::CustomUnitMeasurement; +use vortex_bench::setup_logging_and_tracing; +use vortex_bench::vector_dataset::VectorDataset; + +const BENCHMARK_ID: &str = "vector-search"; + +/// Command-line arguments for `vector-search-bench`. +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Number of timed iterations per measurement. The reported time is the minimum across + /// iterations (matches compress-bench convention). + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + /// Subset of datasets to run. Defaults to Cohere-small. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableDataset::CohereSmall])] + datasets: Vec, + + /// Which benchmark variants to run, using kebab-cased labels. 
The `--formats` name is + /// used (instead of `--variants`) so this benchmark matches the CI invocation + /// convention shared across random-access-bench / compress-bench. Accepted values: + /// `handrolled`, `vortex-uncompressed`, `vortex-default`, `vortex-turboquant`. + /// Defaults to running all four. + #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![SelectableFormat::Handrolled, SelectableFormat::VortexUncompressed, SelectableFormat::VortexDefault, SelectableFormat::VortexTurboQuant])] + formats: Vec, + + /// Number of query rows sampled when computing Recall@K for TurboQuant. 0 disables + /// the quality measurement entirely (useful for smoke tests). + #[arg(long, default_value_t = 100)] + recall_queries: usize, + + /// K in Recall@K. Defaults to 10, matching VectorDBBench conventions. + #[arg(long, default_value_t = DEFAULT_TOP_K)] + recall_k: usize, + + /// Output display format (`table` for humans, `gh-json` for CI ingestion). + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + /// If set, write output to this file instead of stdout. + #[arg(short, long)] + output_path: Option, + + /// Verbose logging. + #[arg(short, long)] + verbose: bool, + + /// Enable perfetto tracing output. 
+ #[arg(long)] + tracing: bool, +} + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] +enum SelectableDataset { + #[clap(name = "cohere-small")] + CohereSmall, + #[clap(name = "cohere-medium")] + CohereMedium, + #[clap(name = "openai-small")] + OpenAiSmall, + #[clap(name = "openai-medium")] + OpenAiMedium, + #[clap(name = "bioasq-medium")] + BioasqMedium, + #[clap(name = "glove-medium")] + GloveMedium, +} + +impl SelectableDataset { + fn into_dataset(self) -> VectorDataset { + match self { + SelectableDataset::CohereSmall => VectorDataset::CohereSmall, + SelectableDataset::CohereMedium => VectorDataset::CohereMedium, + SelectableDataset::OpenAiSmall => VectorDataset::OpenAiSmall, + SelectableDataset::OpenAiMedium => VectorDataset::OpenAiMedium, + SelectableDataset::BioasqMedium => VectorDataset::BioasqMedium, + SelectableDataset::GloveMedium => VectorDataset::GloveMedium, + } + } +} + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)] +enum SelectableFormat { + /// Hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from the + /// canonical parquet file via `parquet-rs` / `arrow-rs`. Compute-cost floor — + /// not a realistic parquet-on-DBMS baseline. See + /// [`vector_search_bench::handrolled_baseline`]. + #[clap(name = "handrolled")] + Handrolled, + /// Raw `Vector` with no encoding compression. + #[clap(name = "vortex-uncompressed")] + VortexUncompressed, + /// BtrBlocks default-compression applied to the FSL storage child. + #[clap(name = "vortex-default")] + VortexDefault, + /// Full TurboQuant pipeline (lossy). 
+ #[clap(name = "vortex-turboquant")] + VortexTurboQuant, +} + +impl SelectableFormat { + fn into_variant(self) -> Option { + match self { + SelectableFormat::Handrolled => None, + SelectableFormat::VortexUncompressed => Some(Variant::VortexUncompressed), + SelectableFormat::VortexDefault => Some(Variant::VortexDefault), + SelectableFormat::VortexTurboQuant => Some(Variant::VortexTurboQuant), + } + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let datasets: Vec = args + .datasets + .iter() + .copied() + .map(SelectableDataset::into_dataset) + .collect(); + + let run_handrolled_baseline = args.formats.contains(&SelectableFormat::Handrolled); + let variants: Vec = args + .formats + .iter() + .filter_map(|f| f.into_variant()) + .collect(); + + // One progress unit per inner-loop body: each Vortex variant plus the handrolled + // path (when it's enabled) gets exactly one `progress.inc(1)` below. Keep this + // count in sync with the number of `progress.inc` sites. + let total_work = datasets.len() * (variants.len() + usize::from(run_handrolled_baseline)); + let progress = ProgressBar::new(total_work as u64); + + let mut timings: Vec = Vec::new(); + let mut sizes: Vec = Vec::new(); + let mut recalls: Vec = Vec::new(); + let mut verification: Vec = Vec::new(); + + for dataset in &datasets { + let prepared = prepare_dataset(dataset).await?; + tracing::info!( + "prepared {}: dim={}, num_rows={}", + prepared.name, + prepared.dim(), + prepared.num_rows() + ); + + // Ground-truth cosine scores for the verification query — the scores produced by + // the uncompressed Vortex scan. Every other variant (including the hand-rolled + // baseline) will be compared against this. 
+ let baseline_scores = compute_cosine_scores(&prepared.uncompressed, &prepared.query) + .context("compute ground-truth cosine scores for verification")?; + tracing::info!( + "computed {} ground-truth cosine scores for {}", + baseline_scores.len(), + prepared.name + ); + + // Hand-rolled baseline. Emitted as a separate pseudo-variant with label + // `handrolled` so it shows up in dashboards next to the Vortex variants. This + // is a hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from + // parquet via `parquet-rs`; it must match the Vortex cosine scores within the + // lossless tolerance (f32 ULPs) because it's computing the same math on the + // same underlying f32 values. + // + // `target.format` stays `Format::Parquet` because the *storage* side is still + // parquet on disk — only the *compute* is hand-rolled. The metric `name` field + // carries the `handrolled` label so human readers can tell the compute apart + // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet. + // + // Timing runs first and returns the cosine scores from its final iteration; + // verification then reuses those scores rather than re-reading the parquet + // file. `cosine_loop` is deterministic, so the last-iteration scores equal + // what a separate pre-timing verification pass would produce — we just save + // one parquet read per dataset. If the scores drift from the Vortex baseline, + // `verify_and_report_scores` bails here (after the timing already ran, which + // is acceptable because the handrolled loop is cheap and we'd rather run it + // twice than skip correctness). 
+ if run_handrolled_baseline { + let parquet_path = dataset.to_parquet_path().await?; + let label = "handrolled"; + let bench_name = format!("{label}/{}", prepared.name); + + let baseline_result = run_handrolled_baseline_timings( + &parquet_path, + &prepared.query, + DEFAULT_THRESHOLD, + args.iterations, + )?; + + let handrolled_report = verify_and_report_scores( + &bench_name, + &baseline_result.last_scores, + &baseline_scores, + VerificationKind::Lossless, + )?; + tracing::info!( + "{} verification (Lossless): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, + handrolled_report.max_abs_diff, + handrolled_report.mean_abs_diff, + ); + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/{bench_name}"), + format: Format::Parquet, + unit: Cow::from("abs-diff"), + value: handrolled_report.max_abs_diff, + }); + + sizes.push(CustomUnitMeasurement { + name: format!("{label} size/{}", prepared.name), + format: Format::Parquet, + unit: Cow::from("bytes"), + value: prepared.parquet_bytes as f64, + }); + timings.push(CompressionTimingMeasurement { + name: format!("decompress time/{bench_name}"), + format: Format::Parquet, + time: baseline_result.timings.decompress, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: Format::Parquet, + time: baseline_result.timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: Format::Parquet, + time: baseline_result.timings.filter, + }); + + progress.inc(1); + } + + for &variant in &variants { + let prep = prepare_variant(&prepared, variant)?; + + let variant_label = variant.label(); + let bench_name = format!("{variant_label}/{}", prepared.name); + + // Correctness verification BEFORE timing. Lossless variants must match + // the uncompressed baseline within f32 noise; TurboQuant must stay within + // its lossy tolerance. 
A failure bails the whole run — you cannot publish + // throughput numbers for an encoding that returns wrong answers. + let kind = if variant == Variant::VortexTurboQuant { + VerificationKind::Lossy + } else { + VerificationKind::Lossless + }; + let report = verify_variant( + &bench_name, + &prep.array, + &prepared.query, + &baseline_scores, + kind, + )?; + tracing::info!( + "{} verification ({:?}): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, + kind, + report.max_abs_diff, + report.mean_abs_diff, + ); + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/{bench_name}"), + format: variant.as_format(), + unit: Cow::from("abs-diff"), + value: report.max_abs_diff, + }); + + // In-memory nbytes — the honest size of the variant tree we're executing. + sizes.push(CustomUnitMeasurement { + name: format!("{variant_label} nbytes/{}", prepared.name), + format: variant.as_format(), + unit: Cow::from("bytes"), + value: prep.nbytes as f64, + }); + + // Compress time — the wall time it takes to build the variant tree from + // the materialized uncompressed source. For the uncompressed variant + // itself this is ~0 (identity), so we still emit it as a measurement for + // dashboard consistency. 
+ timings.push(CompressionTimingMeasurement { + name: format!("compress time/{bench_name}"), + format: variant.as_format(), + time: prep.compress_duration, + }); + + let variant_timings = run_timings(&prep.array, &prepared.query, args.iterations)?; + + timings.push(CompressionTimingMeasurement { + name: format!("decompress time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.decompress, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: variant.as_format(), + time: variant_timings.filter, + }); + + // Recall@K quality measurement for lossy variants only. The lossless + // variants are trivially 1.0 by construction (since they agree with the + // uncompressed baseline within 1e-4) so we skip them to keep noise down. + if args.recall_queries > 0 && variant == Variant::VortexTurboQuant { + let recall = measure_recall_at_k( + &prepared.uncompressed, + &prep.array, + args.recall_queries, + args.recall_k, + )?; + tracing::info!("Recall@{} for {}: {:.4}", args.recall_k, bench_name, recall); + recalls.push(CustomUnitMeasurement { + name: format!("recall@{}/{bench_name}", args.recall_k), + format: variant.as_format(), + unit: Cow::from("recall"), + value: recall, + }); + } + + progress.inc(1); + } + } + progress.finish(); + + let mut writer = create_output_writer(&args.display_format, args.output_path, BENCHMARK_ID)?; + match args.display_format { + DisplayFormat::Table => { + // Our variants span multiple `Format` values *and* multiple labels that share a + // single `Format`, so the existing `render_table` helper (which groups by + // `Target`) would collapse them. Emit one line per measurement instead; this is + // only used for developer inspection — CI consumes `gh-json` via the arm below. 
+ for timing in &timings { + writeln!(writer, "{} {} ns", timing.name, timing.time.as_nanos())?; + } + for size in &sizes { + writeln!(writer, "{} {} {}", size.name, size.value, size.unit)?; + } + for recall in &recalls { + writeln!( + writer, + "{} {:.4} {}", + recall.name, recall.value, recall.unit + )?; + } + for check in &verification { + writeln!(writer, "{} {:.6e} {}", check.name, check.value, check.unit)?; + } + } + DisplayFormat::GhJson => { + print_measurements_json(&mut writer, timings)?; + print_measurements_json(&mut writer, sizes)?; + print_measurements_json(&mut writer, recalls)?; + print_measurements_json(&mut writer, verification)?; + } + } + + Ok(()) +} diff --git a/benchmarks/vector-search-bench/src/recall.rs b/benchmarks/vector-search-bench/src/recall.rs new file mode 100644 index 00000000000..fb1dff4dc8c --- /dev/null +++ b/benchmarks/vector-search-bench/src/recall.rs @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Recall@K quality measurement for lossy vector-search variants. +//! +//! This module computes the fraction of the true top-K nearest neighbours that a +//! lossy encoding (today just TurboQuant) recovers, using the uncompressed Vortex +//! scan as the local ground truth. Recall is averaged over a small number of sampled +//! query rows. +//! +//! This is explicitly a *relative* recall — we compare TurboQuant-retrieved neighbours +//! against the neighbours that the *same* cosine-similarity expression finds in the +//! uncompressed scan, not against VectorDBBench's shipped `neighbors.parquet`. Comparing +//! against external ground truth would require an index (which Vortex doesn't have) and +//! is structurally out of scope for a file-format benchmark. 
+ +use anyhow::Result; +use vortex::array::ArrayRef; +use vortex::utils::aliases::hash_set::HashSet; + +use crate::extract_query_row; +use crate::verify::compute_cosine_scores; + +/// Size of the neighbour set we compare. 10 is the standard VectorDBBench default. +pub const DEFAULT_TOP_K: usize = 10; + +/// Compute recall@K for the lossy `compressed` variant against the `uncompressed` +/// ground-truth variant, averaged over `num_queries` sampled query rows. Uses the +/// global [`vortex_bench::SESSION`] for all executions. +/// +/// Query selection is deterministic: rows are picked uniformly across the dataset at +/// `step = uncompressed.len() / num_queries` intervals. This keeps the result stable +/// across runs and avoids needing to thread a PRNG seed into the benchmark CLI. +pub fn measure_recall_at_k( + uncompressed: &ArrayRef, + compressed: &ArrayRef, + num_queries: usize, + top_k: usize, +) -> Result { + assert!( + num_queries > 0, + "measure_recall_at_k requires num_queries > 0" + ); + assert!(top_k > 0, "measure_recall_at_k requires top_k > 0"); + let num_rows = uncompressed.len(); + assert_eq!( + compressed.len(), + num_rows, + "uncompressed and compressed arrays must have the same row count" + ); + assert!(num_rows >= top_k, "dataset must have at least top_k rows"); + + let step = (num_rows / num_queries).max(1); + + let mut total_hits: usize = 0; + let mut total_checked: usize = 0; + + for q in 0..num_queries { + let row = (q * step).min(num_rows - 1); + let query = extract_query_row(uncompressed, row)?; + + let gt_scores = compute_cosine_scores(uncompressed, &query)?; + let truth = top_k_indices(>_scores, top_k); + + let lossy_scores = compute_cosine_scores(compressed, &query)?; + let lossy = top_k_indices(&lossy_scores, top_k); + + let truth_set: HashSet = truth.iter().copied().collect(); + total_hits += lossy.iter().filter(|idx| truth_set.contains(*idx)).count(); + total_checked += top_k; + } + + Ok(total_hits as f64 / total_checked as f64) +} + 
+/// Return the indices of the top-K highest scores, stable-sorted descending. +/// +/// Uses `f32::total_cmp` for a NaN-safe total order — `partial_cmp` would panic on +/// NaN, and `partial_cmp(...).unwrap_or(Ordering::Equal)` would put NaNs at +/// arbitrary positions. `total_cmp` gives NaNs a well-defined (but meaningless) sort +/// slot, which lets the function be robust against accidental NaN inputs without +/// silently hiding them. +fn top_k_indices(scores: &[f32], top_k: usize) -> Vec { + let mut idx: Vec = (0..scores.len()).collect(); + idx.sort_by(|&a, &b| scores[b].total_cmp(&scores[a])); + idx.truncate(top_k); + idx +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Variant; + use crate::extract_query_row; + use crate::prepare_variant; + use crate::test_utils::synthetic_vector; + + #[test] + fn top_k_indices_handles_nan_without_panicking() { + // `partial_cmp` panics on NaN (well, returns None, which was silently swallowed + // before). `total_cmp` gives NaN a well-defined slot, so the sort doesn't + // panic and doesn't produce arbitrary orderings for non-NaN elements. + let scores = [0.9f32, f32::NAN, 0.7, 0.5, f32::NAN]; + let top = top_k_indices(&scores, 3); + assert_eq!(top.len(), 3); + // The finite values 0.9, 0.7, 0.5 should still rank in the right order + // relative to each other — NaNs sort somewhere, but the finite ordering is + // preserved because `total_cmp` is a total order. 
+ let finite_positions: Vec = top + .iter() + .copied() + .filter(|&i| !scores[i].is_nan()) + .collect(); + assert!( + finite_positions + .windows(2) + .all(|w| scores[w[0]] >= scores[w[1]]), + "finite scores should still be in descending order" + ); + } + + #[test] + fn uncompressed_has_perfect_self_recall() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + let recall = measure_recall_at_k(&uncompressed, &uncompressed, 4, 10).unwrap(); + assert!( + (recall - 1.0).abs() < 1e-9, + "self-recall must be 1.0, got {recall}" + ); + } + + #[test] + fn turboquant_recall_is_reasonable_for_synthetic_data() { + let dim = 128u32; + let num_rows = 64usize; + let uncompressed = synthetic_vector(dim, num_rows, 0xC0FFEE); + + // `measure_recall_at_k` doesn't need the PreparedDataset's `query` field — + // it derives queries internally via `extract_query_row` on `uncompressed`. + // Construct just enough of a `PreparedDataset` to pass to `prepare_variant`. + let prepared = crate::PreparedDataset { + name: "synthetic".to_string(), + uncompressed: uncompressed.clone(), + query: extract_query_row(&uncompressed, 0).unwrap(), + parquet_bytes: 0, + }; + + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); + + // With only 64 random rows, recall@10 won't be 1.0 but it should be well + // above chance (10/64 ≈ 0.156). The test asserts a loose lower bound to catch + // total regressions without being flaky on distribution noise. 
+ let recall = measure_recall_at_k(&uncompressed, &tq_prep.array, 4, 10).unwrap(); + assert!( + recall >= 0.3, + "TurboQuant recall@10 on 64×128 synthetic data should be ≥0.3, got {recall}", + ); + } +} diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs new file mode 100644 index 00000000000..da2a73f1ea2 --- /dev/null +++ b/benchmarks/vector-search-bench/src/verify.rs @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Correctness verification for vector-search variants. +//! +//! Before the timing loop runs, we compute cosine-similarity scores for a single query +//! row against the uncompressed baseline and against each prepared variant, then compare +//! the two score vectors element-by-element. This catches two distinct classes of bug: +//! +//! - A **lossless variant** that disagrees with the uncompressed scan (bug in the +//! compression pipeline, or in how we're routing through the scalar-fn dispatch, or in +//! the variant-specific decompress path). +//! - A **lossy variant** (TurboQuant) that drifts further from ground truth than we +//! expect from the bit-width and SORF rotation settings (regression in the encoder). +//! +//! The same `execute_cosine` function the timing loop uses is also what verification +//! uses, so the correctness check is validating the *exact* expression tree we're about +//! to benchmark. Lossless variants must match within [`LOSSLESS_TOLERANCE`]; lossy +//! variants must match within [`LOSSY_TOLERANCE`]. A hard-stop `Err` return on any +//! mismatch keeps the benchmark honest — you cannot publish throughput numbers for a +//! variant that's returning garbage. 
+ +use anyhow::Result; +use anyhow::bail; +use vortex::array::ArrayRef; +use vortex::array::VortexSessionExecute; +use vortex::dtype::PType; +use vortex_bench::SESSION; + +use crate::execute_cosine; + +/// Maximum acceptable absolute difference in cosine scores for a *lossless* variant +/// (uncompressed, BtrBlocks-default). `cosine_similarity` traverses the FSL storage and +/// reduces with f32 accumulators, so a pure algebraic change of encoding can shift a +/// score by a few ULPs of f32 precision. `1e-4` is well above that noise floor while +/// still catching real regressions. +pub const LOSSLESS_TOLERANCE: f32 = 1e-4; + +/// Maximum acceptable absolute difference in cosine scores for the *lossy* TurboQuant +/// variant. At the default 8-bit configuration the reconstructed dot product typically +/// drifts by well under 0.05 for unit-normalized vectors. `0.2` is a loose upper bound +/// that catches regressions without flaking on distribution-specific noise. +pub const LOSSY_TOLERANCE: f32 = 0.2; + +/// How lossy a variant is allowed to be when its scores are compared to the +/// uncompressed baseline. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VerificationKind { + /// Lossless variants must match within [`LOSSLESS_TOLERANCE`]. + Lossless, + /// Lossy variants must match within [`LOSSY_TOLERANCE`]. + Lossy, +} + +/// Per-variant correctness report. Captured for both pass and fail outcomes so the +/// caller can emit the numbers as dashboard measurements regardless. +#[derive(Debug, Clone, Copy)] +pub struct VerificationReport { + /// Number of rows compared (== dataset row count). + pub num_scores: usize, + /// Mean absolute difference between baseline and variant cosine scores. + pub mean_abs_diff: f64, + /// Max absolute difference between baseline and variant cosine scores. + pub max_abs_diff: f64, + /// Which tolerance band applied. + pub kind: VerificationKind, + /// Whether the variant's max-abs-diff stayed within its tolerance. 
+ pub passed: bool, +} + +impl VerificationReport { + /// The tolerance that was applied to produce [`Self::passed`]. + pub fn tolerance(&self) -> f32 { + match self.kind { + VerificationKind::Lossless => LOSSLESS_TOLERANCE, + VerificationKind::Lossy => LOSSY_TOLERANCE, + } + } +} + +/// Compute cosine-similarity scores for a single query row on `data` and return them +/// as a plain `Vec`. This is a convenience wrapper around +/// [`crate::execute_cosine`] that pulls the f32 slice out of the resulting +/// `PrimitiveArray`. Uses the global [`vortex_bench::SESSION`]. +/// +/// # Errors +/// +/// Returns an error if [`execute_cosine`] fails (bad input shape or dispatch error), +/// or if the cosine expression produces a non-`f32` primitive array. The latter can't +/// happen today because the benchmark only wires `f32` `Vector` columns, but the +/// explicit ptype check keeps the function sound if the scalar-fn output type ever +/// widens (e.g. to `f64`) without the caller noticing. +pub fn compute_cosine_scores(data: &ArrayRef, query: &[f32]) -> Result> { + let mut ctx = SESSION.create_execution_ctx(); + let scores = execute_cosine(data, query, &mut ctx)?; + if scores.ptype() != PType::F32 { + bail!( + "compute_cosine_scores: cosine output must be f32, got {:?}", + scores.ptype() + ); + } + Ok(scores.as_slice::().to_vec()) +} + +/// Compare two equal-length score vectors and return their mean absolute difference +/// and max absolute difference, without evaluating a pass/fail threshold. +pub fn compare_scores(baseline: &[f32], other: &[f32]) -> (f64, f64) { + assert_eq!( + baseline.len(), + other.len(), + "compare_scores: length mismatch baseline={} other={}", + baseline.len(), + other.len(), + ); + + if baseline.is_empty() { + return (0.0, 0.0); + } + + let mut sum = 0.0f64; + let mut max: f64 = 0.0; + for (&b, &o) in baseline.iter().zip(other.iter()) { + // Treat (+0, -0) pairs as equal and propagate NaN as inf so it always fails + // the tolerance check below. 
+ let diff = if b.is_nan() || o.is_nan() { + f64::INFINITY + } else { + (f64::from(b) - f64::from(o)).abs() + }; + sum += diff; + if diff > max { + max = diff; + } + } + (sum / baseline.len() as f64, max) +} + +/// Verify one variant's scores against a baseline and produce a full +/// [`VerificationReport`]. Whether `passed` is true depends on `kind`'s tolerance. +pub fn verify_scores( + baseline: &[f32], + variant_scores: &[f32], + kind: VerificationKind, +) -> VerificationReport { + let (mean_abs_diff, max_abs_diff) = compare_scores(baseline, variant_scores); + let tolerance = match kind { + VerificationKind::Lossless => f64::from(LOSSLESS_TOLERANCE), + VerificationKind::Lossy => f64::from(LOSSY_TOLERANCE), + }; + let passed = max_abs_diff <= tolerance; + VerificationReport { + num_scores: baseline.len(), + mean_abs_diff, + max_abs_diff, + kind, + passed, + } +} + +/// Verify pre-computed scores against a baseline and enforce the tolerance band. +/// +/// Takes already-materialized `variant_scores` (as a `&[f32]`) rather than an +/// `ArrayRef`, so both the Vortex-variant path (which computes scores via +/// [`execute_cosine`](crate::execute_cosine)) and the hand-rolled baseline path (which +/// runs a plain Rust loop over a flat `Vec`) share the same error-handling, +/// logging, and hard-fail logic without duplicating it in `main.rs`. +/// +/// Lossless mismatches bail the run with an error; lossy mismatches log a warning +/// but let the run continue so the recall measurement is still reported. 
+pub fn verify_and_report_scores( + variant_name: &str, + variant_scores: &[f32], + baseline_scores: &[f32], + kind: VerificationKind, +) -> Result { + let report = verify_scores(baseline_scores, variant_scores, kind); + + if !report.passed { + let message = format!( + "{variant_name} correctness check failed: max_abs_diff={:.6}, \ + mean_abs_diff={:.6}, tolerance={:.6} ({:?})", + report.max_abs_diff, + report.mean_abs_diff, + report.tolerance(), + report.kind, + ); + match kind { + VerificationKind::Lossless => bail!("{message}"), + VerificationKind::Lossy => { + tracing::warn!("{message}"); + } + } + } + + Ok(report) +} + +/// End-to-end variant verification: executes cosine on `variant_array` against the +/// same query used for the baseline and returns a [`VerificationReport`]. Returns +/// `Err` if `kind` is [`VerificationKind::Lossless`] and the scores disagree beyond +/// [`LOSSLESS_TOLERANCE`] — that indicates a real correctness bug, not a quality +/// tradeoff. Uses the global [`vortex_bench::SESSION`]. +pub fn verify_variant( + variant_name: &str, + variant_array: &ArrayRef, + query: &[f32], + baseline_scores: &[f32], + kind: VerificationKind, +) -> Result { + let scores = compute_cosine_scores(variant_array, query)?; + verify_and_report_scores(variant_name, &scores, baseline_scores, kind) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Variant; + use crate::extract_query_row; + use crate::prepare_variant; + use crate::test_utils::synthetic_vector; + + /// Build a `PreparedDataset` whose `query` is row 0 of the dataset. Using + /// `extract_query_row` here (rather than a test-local f32 extraction helper) also + /// keeps the test surface covered by the same ptype-assertion path the benchmark + /// hot path uses. 
+ fn make_prepared(dim: u32, num_rows: usize, seed: u64) -> crate::PreparedDataset { + let uncompressed = synthetic_vector(dim, num_rows, seed); + let query = extract_query_row(&uncompressed, 0).unwrap(); + assert_eq!(query.len(), dim as usize); + crate::PreparedDataset { + name: "synthetic".to_string(), + uncompressed, + query, + parquet_bytes: 0, + } + } + + #[test] + fn compare_scores_handles_empty() { + let (mean, max) = compare_scores(&[], &[]); + assert_eq!(mean, 0.0); + assert_eq!(max, 0.0); + } + + #[test] + fn compare_scores_computes_mae_and_max() { + let base = [0.0f32, 1.0, 2.0, 3.0]; + let other = [0.0f32, 1.0, 2.5, 3.0]; + let (mean, max) = compare_scores(&base, &other); + assert!((max - 0.5).abs() < 1e-9); + assert!((mean - 0.125).abs() < 1e-9); + } + + #[test] + fn verify_scores_passes_for_identical_inputs() { + let base = [0.5f32; 10]; + let report = verify_scores(&base, &base, VerificationKind::Lossless); + assert!(report.passed); + assert_eq!(report.max_abs_diff, 0.0); + assert_eq!(report.mean_abs_diff, 0.0); + assert_eq!(report.num_scores, 10); + } + + #[test] + fn verify_scores_fails_for_lossless_beyond_tolerance() { + let base = [0.5f32; 10]; + let mut other = [0.5f32; 10]; + other[3] = 0.50001; // diff ≈ 1e-5, comfortably below the 1e-4 lossless bound + let report_ok = verify_scores(&base, &other, VerificationKind::Lossless); + assert!( + report_ok.passed, + "1e-5 drift should pass, got max={:.2e}", + report_ok.max_abs_diff + ); + + other[3] = 0.51; // diff of 0.01, well above 1e-4 + let report_bad = verify_scores(&base, &other, VerificationKind::Lossless); + assert!( + !report_bad.passed, + "1e-2 drift should fail, got max={:.2e}", + report_bad.max_abs_diff + ); + } + + #[test] + fn verify_scores_lossy_tolerates_small_drift() { + let base = [0.9f32; 10]; + let mut other = [0.9f32; 10]; + other[0] = 1.0; // diff of 0.1 + let report = verify_scores(&base, &other, VerificationKind::Lossy); + assert!( + report.passed, + "0.1 drift should pass 
lossy tolerance, got max={}", + report.max_abs_diff + ); + } + + #[test] + fn verify_scores_fails_on_nan() { + let base = [0.5f32, 0.5]; + let other = [0.5f32, f32::NAN]; + let report = verify_scores(&base, &other, VerificationKind::Lossless); + assert!(!report.passed); + assert!(report.max_abs_diff.is_infinite()); + } + + #[test] + fn verify_scores_fails_on_nan_in_baseline() { + // Symmetric case: NaN on the baseline side should also fail, not just variant. + let base = [0.5f32, f32::NAN]; + let other = [0.5f32, 0.5]; + let report = verify_scores(&base, &other, VerificationKind::Lossless); + assert!(!report.passed); + assert!(report.max_abs_diff.is_infinite()); + } + + #[test] + fn verify_and_report_scores_is_ok_for_identical_inputs() { + let base = [0.5f32; 10]; + let report = + verify_and_report_scores("self", &base, &base, VerificationKind::Lossless).unwrap(); + assert!(report.passed); + assert_eq!(report.max_abs_diff, 0.0); + } + + #[test] + fn verify_and_report_scores_bails_for_lossless_mismatch() { + let base = [0.5f32; 10]; + let mut other = [0.5f32; 10]; + other[3] = 0.6; + let err = + verify_and_report_scores("broken-variant", &other, &base, VerificationKind::Lossless) + .unwrap_err() + .to_string(); + assert!( + err.contains("broken-variant correctness check failed"), + "unexpected error: {err}" + ); + } + + #[test] + fn verify_and_report_scores_warns_for_lossy_mismatch_without_bailing() { + // A lossy variant outside its tolerance should NOT bail — it logs a warning + // and returns the failing report so the caller can still emit the + // measurement and show recall alongside it. 
+ let base = [0.9f32; 10]; + let mut other = [0.9f32; 10]; + other[0] = 1.5; // diff of 0.6, above the 0.2 lossy tolerance + let report = + verify_and_report_scores("too-lossy-variant", &other, &base, VerificationKind::Lossy) + .expect("lossy failures should not bail"); + assert!(!report.passed); + assert!(report.max_abs_diff > f64::from(LOSSY_TOLERANCE)); + } + + #[test] + fn vortex_default_matches_uncompressed_end_to_end() { + let dim = 128u32; + let num_rows = 64usize; + let prepared = make_prepared(dim, num_rows, 0xC0FFEE); + + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query).unwrap(); + + let default_prep = prepare_variant(&prepared, Variant::VortexDefault).unwrap(); + let report = verify_variant( + "vortex-default", + &default_prep.array, + &prepared.query, + &baseline_scores, + VerificationKind::Lossless, + ) + .expect("vortex-default must be lossless against the uncompressed baseline"); + assert!(report.passed); + } + + #[test] + fn vortex_turboquant_stays_within_lossy_tolerance() { + let dim = 128u32; + let num_rows = 64usize; + let prepared = make_prepared(dim, num_rows, 0xDEADBEEF); + + let baseline_scores = + compute_cosine_scores(&prepared.uncompressed, &prepared.query).unwrap(); + + let tq_prep = prepare_variant(&prepared, Variant::VortexTurboQuant).unwrap(); + let report = verify_variant( + "vortex-turboquant", + &tq_prep.array, + &prepared.query, + &baseline_scores, + VerificationKind::Lossy, + ) + .expect("TurboQuant verification should not error"); + assert!( + report.passed, + "TurboQuant drift {:.4} exceeds lossy tolerance {:.4}", + report.max_abs_diff, + report.tolerance() + ); + } +} diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 62d302f12e1..1859a64a16e 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -63,3 +63,4 @@ vortex = { workspace = true, features = [ "tokio", "zstd", ] } +vortex-tensor = { workspace = true } diff --git a/vortex-bench/src/conversions.rs 
b/vortex-bench/src/conversions.rs index 3f21ab30ba0..9811abc903d 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -21,17 +21,30 @@ use vortex::VortexSessionDefault; use vortex::array::ArrayRef; use vortex::array::IntoArray; use vortex::array::VortexSessionExecute; +use vortex::array::arrays::Chunked; use vortex::array::arrays::ChunkedArray; +use vortex::array::arrays::ExtensionArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::List; +use vortex::array::arrays::ListView; +use vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::array::arrays::list::ListArrayExt; +use vortex::array::arrays::listview::recursive_list_from_list_view; use vortex::array::arrow::FromArrowArray; use vortex::array::builders::builder_with_capacity; +use vortex::array::extension::EmptyMetadata; use vortex::array::stream::ArrayStreamAdapter; use vortex::array::stream::ArrayStreamExt; +use vortex::array::validity::Validity; use vortex::dtype::DType; use vortex::dtype::arrow::FromArrowType; +use vortex::dtype::extension::ExtDType; use vortex::error::VortexResult; +use vortex::error::vortex_bail; use vortex::error::vortex_err; use vortex::file::WriteOptionsSessionExt; use vortex::session::VortexSession; +use vortex_tensor::vector::Vector; use crate::CompactionStrategy; use crate::Format; @@ -222,3 +235,347 @@ pub async fn write_parquet_as_vortex( }) .await } + +/// Rewrap a list-of-float column as a [`vortex_tensor::vector::Vector`] extension array. +/// +/// Parquet has no fixed-size list logical type, so an embedding column ingested via +/// [`parquet_to_vortex_chunks`] arrives as `List` (or `List` / `List`) even +/// when every row has the same length. This helper validates that every list in `input` +/// has the same length `D` and reconstructs the column as +/// `Extension(FixedSizeList)` — the shape expected by the vector-search +/// scalar functions in `vortex-tensor`. 
+/// +/// The input may be either a single [`List`] array or a [`Chunked`] array of lists (the +/// common case after `parquet_to_vortex_chunks`). Chunked inputs are converted chunk-by-chunk +/// and reassembled as a [`ChunkedArray`] of `Extension`. +/// +/// # Errors +/// +/// Returns an error if: +/// - `input` is not a `List` or `Chunked` array. +/// - The element type is not a non-nullable float primitive (`f16`, `f32`, or `f64`). +/// - Any row has a different length than the first row. +/// - The list validity is nullable (vector elements cannot be null at the row level). +/// - The input has zero rows (the dimension cannot be inferred from empty input). +pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { + if let Some(chunked) = input.as_opt::() { + let converted: Vec = chunked + .iter_chunks() + .map(|chunk| list_to_vector_ext(chunk.clone())) + .collect::>()?; + if converted.is_empty() { + vortex_bail!("list_to_vector_ext: chunked input has no chunks"); + } + return Ok(ChunkedArray::from_iter(converted).into_array()); + } + + // `parquet_to_vortex_chunks` produces `ListView` arrays for list columns by default; + // materialize them into a flat `List` representation before we validate offsets. 
+ if input.as_opt::().is_some() { + let flat = recursive_list_from_list_view(input)?; + return list_to_vector_ext(flat); + } + + let Some(list) = input.as_opt::() else { + vortex_bail!( + "list_to_vector_ext: expected a List array, got dtype {}", + input.dtype() + ); + }; + + if !matches!( + list.list_validity(), + Validity::NonNullable | Validity::AllValid + ) { + vortex_bail!( + "list_to_vector_ext: list rows must be non-nullable for Vector extension wrapping" + ); + } + + let element_dtype = list.element_dtype().clone(); + let DType::Primitive(ptype, elem_nullability) = &element_dtype else { + vortex_bail!( + "list_to_vector_ext: element dtype must be a primitive float, got {}", + element_dtype + ); + }; + if !ptype.is_float() { + vortex_bail!( + "list_to_vector_ext: element type must be float (f16/f32/f64), got {}", + ptype + ); + } + if elem_nullability.is_nullable() { + vortex_bail!( + "list_to_vector_ext: element type must be non-nullable, got nullable {}", + ptype + ); + } + + let num_rows = input.len(); + if num_rows == 0 { + vortex_bail!("list_to_vector_ext: cannot infer vector dimension from empty input"); + } + + // Walk the offsets array once, reusing the previous iteration's `end` as the + // next iteration's `start`. Each `offset_at` call goes through + // `ListArrayExt::offset_at`, which has a fast path when the offsets child is a + // `Primitive` array (direct slice index). That's the common case after + // `parquet_to_vortex_chunks`, so for a 100K-row column we do ~100K primitive + // slice indexes rather than 200K. The loop body is O(1) either way. 
+ let mut prev_end = list.offset_at(0)?; + let first_end = list.offset_at(1)?; + let dim = first_end.checked_sub(prev_end).ok_or_else(|| { + vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") + })?; + if dim == 0 { + vortex_bail!("list_to_vector_ext: first row has zero elements"); + } + prev_end = first_end; + + for i in 1..num_rows { + let end = list.offset_at(i + 1)?; + let row_len = end.checked_sub(prev_end).ok_or_else(|| { + vortex_err!("list_to_vector_ext: offsets are not monotonically increasing") + })?; + if row_len != dim { + vortex_bail!( + "list_to_vector_ext: row {} has length {} but expected {}", + i, + row_len, + dim + ); + } + prev_end = end; + } + + let elements = list.sliced_elements()?; + let expected_elements = num_rows + .checked_mul(dim) + .ok_or_else(|| vortex_err!("list_to_vector_ext: num_rows * dim overflows usize"))?; + if elements.len() != expected_elements { + vortex_bail!( + "list_to_vector_ext: elements buffer has length {} but expected {}", + elements.len(), + expected_elements + ); + } + + let dim_u32 = u32::try_from(dim) + .map_err(|_| vortex_err!("list_to_vector_ext: dimension {dim} does not fit in u32"))?; + + let fsl = FixedSizeListArray::try_new(elements, dim_u32, Validity::NonNullable, num_rows)?; + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) +} + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::Extension; + use vortex::array::arrays::List; + use vortex::array::arrays::ListViewArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::arrays::extension::ExtensionArrayExt; + use vortex::array::validity::Validity; + use vortex::buffer::BufferMut; + use vortex::dtype::DType; + + use super::list_to_vector_ext; + + fn list_f32(rows: &[&[f32]]) -> vortex::array::ArrayRef { + let mut elements = BufferMut::::with_capacity(rows.iter().map(|r| 
r.len()).sum()); + let mut offsets = BufferMut::::with_capacity(rows.len() + 1); + offsets.push(0); + for row in rows { + for &v in row.iter() { + elements.push(v); + } + offsets.push(i32::try_from(elements.len()).unwrap()); + } + + let elements_array = + PrimitiveArray::new::(elements.freeze(), Validity::NonNullable).into_array(); + let offsets_array = + PrimitiveArray::new::(offsets.freeze(), Validity::NonNullable).into_array(); + vortex::array::Array::::new(elements_array, offsets_array, Validity::NonNullable) + .into_array() + } + + #[test] + fn uniform_list_becomes_vector_extension() { + let list = list_f32(&[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0], &[7.0, 8.0, 9.0]]); + let wrapped = list_to_vector_ext(list).unwrap(); + assert_eq!(wrapped.len(), 3); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + + #[test] + fn mismatched_row_length_is_rejected() { + let list = list_f32(&[&[1.0, 2.0, 3.0], &[4.0, 5.0]]); + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("row 1 has length 2 but expected 3"), + "unexpected error: {err}", + ); + } + + #[test] + fn non_list_input_is_rejected() { + let primitive = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let err = list_to_vector_ext(primitive).unwrap_err().to_string(); + assert!( + err.contains("expected a List array"), + "unexpected error: {err}" + ); + } + + #[test] + fn empty_input_is_rejected() { + let list = list_f32(&[]); + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("cannot infer vector dimension from empty input"), + "unexpected error: {err}", + ); + } + + /// Build a `ListView` whose every row is a length-`dim` slice of the flattened + /// `values` buffer. 
This shape matches what `parquet_to_vortex_chunks` produces for + /// embedding columns after arrow-rs' canonicalization, and exercises the + /// `list_to_vector_ext` fast-path that collapses `ListView` → `List` before + /// validating offsets. + fn list_view_f32(dim: usize, rows: &[&[f32]]) -> vortex::array::ArrayRef { + let mut values = BufferMut::::with_capacity(rows.len() * dim); + for row in rows { + assert_eq!(row.len(), dim); + for &v in row.iter() { + values.push(v); + } + } + let elements = + PrimitiveArray::new::(values.freeze(), Validity::NonNullable).into_array(); + + let dim_i32 = i32::try_from(dim).unwrap(); + let num_rows = rows.len(); + + let mut offsets_buf = BufferMut::::with_capacity(num_rows); + for i in 0..num_rows { + offsets_buf.push(i32::try_from(i).unwrap() * dim_i32); + } + let offsets = + PrimitiveArray::new::(offsets_buf.freeze(), Validity::NonNullable).into_array(); + + let mut sizes_buf = BufferMut::::with_capacity(num_rows); + for _ in 0..num_rows { + sizes_buf.push(dim_i32); + } + let sizes = + PrimitiveArray::new::(sizes_buf.freeze(), Validity::NonNullable).into_array(); + + ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable) + .unwrap() + .into_array() + } + + #[test] + fn list_view_input_is_rewrapped_as_vector_extension() { + // Simulates the post-parquet-ingest shape: the `emb` column arrives as a + // ListView, not a List. `list_to_vector_ext` must materialize it via + // `recursive_list_from_list_view` and then validate offsets on the flattened + // `List` form. + let list_view = list_view_f32(3, &[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]]); + let wrapped = list_to_vector_ext(list_view).unwrap(); + assert_eq!(wrapped.len(), 2); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + + #[test] + fn all_invalid_list_validity_is_rejected() { + // A list with `Validity::AllInvalid` means every row is null. 
The Vector + // extension type requires non-nullable elements at the FSL level, so we + // must reject this input rather than silently dropping the validity mask. + let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 3, 6]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = + vortex::array::Array::::new(elements, offsets, Validity::AllInvalid).into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("list rows must be non-nullable"), + "unexpected error: {err}" + ); + } + + #[test] + fn non_float_element_type_is_rejected() { + // Build a List. + let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1i32, 2, 3, 4]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 2, 4]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) + .into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("element type must be float"), + "unexpected error: {err}", + ); + } + + #[test] + fn nullable_element_dtype_is_rejected() { + // Build a `List` — a list whose elements have nullable dtype (even + // if every value happens to be present). The `Vector` extension type at the + // FSL level requires non-nullable elements, so this must be rejected. + // + // Passing `Validity::AllValid` to `PrimitiveArray::new` sets the ptype's + // nullability to `Nullable`, which is what triggers the rejection path even + // though every value is technically valid. 
+ let elements = PrimitiveArray::new::( + BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), + Validity::AllValid, + ) + .into_array(); + let offsets = PrimitiveArray::new::( + BufferMut::::from_iter([0i32, 3, 6]).freeze(), + Validity::NonNullable, + ) + .into_array(); + let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) + .into_array(); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("element type must be non-nullable"), + "unexpected error: {err}" + ); + } +} diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index af0d3fdef30..3637e050131 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -53,6 +53,7 @@ pub mod statpopgen; pub mod tpcds; pub mod tpch; pub mod utils; +pub mod vector_dataset; pub use benchmark::Benchmark; pub use benchmark::TableSpec; @@ -138,6 +139,13 @@ pub enum Format { #[clap(name = "vortex-compact")] #[serde(rename = "vortex-compact")] VortexCompact, + /// Vortex file that uses one or more lossy encodings — e.g. TurboQuant + /// vector quantization, or a future product/binary/scalar quantizer. + /// Distinguishes lossy-compressed vector data from the lossless + /// [`Format::OnDiskVortex`] layout on downstream dashboards. 
+ #[clap(name = "vortex-lossy")] + #[serde(rename = "vortex-lossy")] + VortexLossy, #[clap(name = "duckdb")] #[serde(rename = "duckdb")] OnDiskDuckDB, @@ -177,6 +185,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex-file-compressed", Format::VortexCompact => "vortex-compact", + Format::VortexLossy => "vortex-lossy", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } @@ -189,6 +198,7 @@ impl Format { Format::Parquet => "parquet", Format::OnDiskVortex => "vortex", Format::VortexCompact => "vortex", + Format::VortexLossy => "vortex", Format::OnDiskDuckDB => "duckdb", Format::Lance => "lance", } diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index f49349cd95e..3abd5a2d7c7 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -347,11 +347,11 @@ pub struct CompressionTimingMeasurement { impl ToJson for CompressionTimingMeasurement { fn to_json(&self) -> serde_json::Value { let (name, engine) = match self.format { - Format::OnDiskVortex => (self.name.to_string(), Engine::Vortex), + Format::OnDiskVortex | Format::VortexLossy => (self.name.to_string(), Engine::Vortex), Format::Parquet => (format!("parquet_rs-zstd {}", self.name), Engine::Arrow), Format::Lance => (format!("lance {}", self.name), Engine::Arrow), _ => vortex_panic!( - "CompressionTimingMeasurement only supports vortex, lance, and parquet formats" + "CompressionTimingMeasurement only supports vortex, vortex-lossy, lance, and parquet formats" ), }; @@ -392,7 +392,7 @@ pub struct CustomUnitMeasurement { impl ToJson for CustomUnitMeasurement { fn to_json(&self) -> serde_json::Value { let engine = match self.format { - Format::OnDiskVortex | Format::VortexCompact => Engine::Vortex, + Format::OnDiskVortex | Format::VortexCompact | Format::VortexLossy => Engine::Vortex, Format::Parquet => Engine::Arrow, Format::Lance => Engine::Arrow, _ => Engine::Vortex, // Default to Vortex for other formats. 
diff --git a/vortex-bench/src/vector_dataset.rs b/vortex-bench/src/vector_dataset.rs new file mode 100644 index 00000000000..e6b049fe614 --- /dev/null +++ b/vortex-bench/src/vector_dataset.rs @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Datasets used by the vector-search benchmark. +//! +//! These are a subset of the public VectorDBBench +//! () datasets — MIT-licensed canonical +//! embedding corpora published by Zilliz under +//! `https://assets.zilliz.com/benchmark//`. Each dataset is distributed as one or more +//! parquet files with a `emb: list` column (the raw embedding vectors) and an +//! `id: int64` column. +//! +//! The URL constants below point at the upstream Zilliz bucket. For CI runs we recommend +//! mirroring these files into an internal bucket first to avoid repeated egress charges on +//! a third-party bucket — mirror setup is a one-off manual operation and documented in the +//! vector-search-bench crate README. + +use std::path::PathBuf; + +use anyhow::Result; +use anyhow::bail; +use async_trait::async_trait; +use vortex::array::ArrayRef; + +use crate::IdempotentPath; +use crate::datasets::Dataset; +use crate::datasets::data_downloads::download_data; + +/// A public embedding-vector dataset used by the vector-search benchmark. +/// +/// Each variant is one of the canonical VectorDBBench corpora, distributed as parquet under +/// the Zilliz public benchmark bucket. The smaller `*Small` sizes are appropriate for CI +/// runs; the larger sizes are intended for local / on-demand experiments. Only +/// cosine-metric datasets are wired today — SIFT / GIST / LAION (L2) will follow when an +/// L2-distance scalar function lands in `vortex-tensor`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VectorDataset { + /// Cohere wiki-22-12, 100K rows × 768 dims, cosine metric. ~307 MB raw / ~150 MB + /// zstd-parquet — the default CI-friendly size. 
+ CohereSmall, + /// Cohere wiki-22-12, 1M rows × 768 dims, cosine metric. ~3 GB raw / ~1.5 GB + /// zstd-parquet. Local-only by default. + CohereMedium, + /// OpenAI embeddings on C4, 50K rows × 1536 dims, cosine metric. ~307 MB raw — + /// the smallest OpenAI variant and comparable in size to Cohere-small, but with + /// double the dimensionality. + OpenAiSmall, + /// OpenAI embeddings on C4, 500K rows × 1536 dims, cosine metric. ~3 GB raw. + /// Local-only by default. + OpenAiMedium, + /// Bioasq biomedical embeddings, 1M rows × 1024 dims, cosine metric. ~4 GB raw. + /// Local-only by default. + BioasqMedium, + /// Glove word embeddings, 1M rows × 200 dims, cosine metric. ~800 MB raw. + GloveMedium, +} + +/// All built-in [`VectorDataset`] variants in a fixed order. Convenient for iterating or +/// for listing choices in CLI help. +pub const ALL_VECTOR_DATASETS: &[VectorDataset] = &[ + VectorDataset::CohereSmall, + VectorDataset::CohereMedium, + VectorDataset::OpenAiSmall, + VectorDataset::OpenAiMedium, + VectorDataset::BioasqMedium, + VectorDataset::GloveMedium, +]; + +impl VectorDataset { + /// The upstream URL for this dataset's canonical train-split parquet file. + /// + /// **CI note**: point at an internal mirror before enabling this benchmark in CI — + /// see `benchmarks/vector-search-bench/README.md` for the procedure. 
+ pub fn parquet_url(&self) -> &'static str { + match self { + VectorDataset::CohereSmall => { + "https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet" + } + VectorDataset::CohereMedium => { + "https://assets.zilliz.com/benchmark/cohere_medium_1m/train.parquet" + } + VectorDataset::OpenAiSmall => { + "https://assets.zilliz.com/benchmark/openai_small_50k/train.parquet" + } + VectorDataset::OpenAiMedium => { + "https://assets.zilliz.com/benchmark/openai_medium_500k/train.parquet" + } + VectorDataset::BioasqMedium => { + "https://assets.zilliz.com/benchmark/bioasq_medium_1m/train.parquet" + } + VectorDataset::GloveMedium => { + "https://assets.zilliz.com/benchmark/glove_medium_1m/train.parquet" + } + } + } + + /// Fixed vector dimensionality for this dataset. + pub fn dim(&self) -> u32 { + match self { + VectorDataset::CohereSmall | VectorDataset::CohereMedium => 768, + VectorDataset::OpenAiSmall | VectorDataset::OpenAiMedium => 1536, + VectorDataset::BioasqMedium => 1024, + VectorDataset::GloveMedium => 200, + } + } + + /// Expected number of rows in the train split. + pub fn num_rows(&self) -> usize { + match self { + VectorDataset::CohereSmall => 100_000, + VectorDataset::CohereMedium => 1_000_000, + VectorDataset::OpenAiSmall => 50_000, + VectorDataset::OpenAiMedium => 500_000, + VectorDataset::BioasqMedium => 1_000_000, + VectorDataset::GloveMedium => 1_000_000, + } + } + + /// The distance metric the upstream dataset was curated for. v1 only wires cosine, so + /// every built-in dataset returns [`VectorMetric::Cosine`]. The enum variant exists so + /// that L2 / inner-product datasets can be added later without a breaking change. 
+ pub fn metric(&self) -> VectorMetric { + match self { + VectorDataset::CohereSmall + | VectorDataset::CohereMedium + | VectorDataset::OpenAiSmall + | VectorDataset::OpenAiMedium + | VectorDataset::BioasqMedium + | VectorDataset::GloveMedium => VectorMetric::Cosine, + } + } +} + +/// Distance metric a dataset was curated for. The vector-search benchmark only wires cosine +/// today, but having this explicit makes it obvious when a future dataset should be paired +/// with L2 or inner-product instead. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VectorMetric { + /// Cosine similarity: `dot(a, b) / (||a|| * ||b||)`. + Cosine, + /// Squared L2 distance: `sum((a - b)^2)`. + L2, + /// Inner product: `dot(a, b)`. + InnerProduct, +} + +#[async_trait] +impl Dataset for VectorDataset { + fn name(&self) -> &str { + match self { + VectorDataset::CohereSmall => "cohere-small", + VectorDataset::CohereMedium => "cohere-medium", + VectorDataset::OpenAiSmall => "openai-small", + VectorDataset::OpenAiMedium => "openai-medium", + VectorDataset::BioasqMedium => "bioasq-medium", + VectorDataset::GloveMedium => "glove-medium", + } + } + + async fn to_parquet_path(&self) -> Result { + let dir = format!("{}/", self.name()).to_data_path(); + let parquet = dir.join(format!("{}.parquet", self.name())); + download_data(parquet.clone(), self.parquet_url()).await?; + Ok(parquet) + } + + /// **Not supported.** `VectorDataset` can't return a straight Vortex array via + /// [`Dataset::to_vortex_array`] because: + /// + /// - The struct-shaped array the other datasets return would arrive as + /// `{ id: int64, emb: list }` — with `emb` as a *list*, not the + /// `Extension(FixedSizeList<...>)` shape the vector-search benchmark + /// actually operates on. 
+ /// - The benchmark therefore bypasses this method entirely: it calls + /// [`Dataset::to_parquet_path`] and then runs + /// [`crate::conversions::parquet_to_vortex_chunks`] + + /// [`crate::conversions::list_to_vector_ext`] itself, which produces the + /// correct `Extension` shape. + /// + /// Returning the raw struct here would be a trap for future callers who expect + /// the same semantic shape the benchmark measures. Bailing explicitly makes the + /// contract unambiguous. + async fn to_vortex_array(&self) -> Result { + bail!( + "VectorDataset::to_vortex_array is not supported; use `to_parquet_path` + \ + `parquet_to_vortex_chunks` + `list_to_vector_ext` to build the \ + Extension shape the benchmark needs" + ); + } +} + +#[cfg(test)] +mod tests { + use vortex::utils::aliases::hash_set::HashSet; + + use super::ALL_VECTOR_DATASETS; + use super::VectorDataset; + use super::VectorMetric; + use crate::datasets::Dataset; + + #[test] + fn cohere_small_metadata() { + let ds = VectorDataset::CohereSmall; + assert_eq!(ds.name(), "cohere-small"); + assert_eq!(ds.dim(), 768); + assert_eq!(ds.num_rows(), 100_000); + assert_eq!(ds.metric(), VectorMetric::Cosine); + assert!(ds.parquet_url().ends_with("/train.parquet")); + assert!(ds.parquet_url().contains("cohere_small_100k")); + } + + #[test] + fn all_datasets_have_consistent_metadata() { + // Every built-in dataset must have a unique kebab-cased name, point at a + // `train.parquet` file under `assets.zilliz.com/benchmark/`, declare a + // dimension ≥ `MIN_DIMENSION` for TurboQuant, a non-zero row count, and + // (for v1) cosine metric. 
+ let mut seen_names: HashSet = HashSet::default(); + for &ds in ALL_VECTOR_DATASETS { + let name = ds.name(); + assert!( + seen_names.insert(name.to_string()), + "duplicate dataset name {name}", + ); + assert!( + ds.dim() >= 128, + "{name} dim {} below TurboQuant minimum", + ds.dim() + ); + assert!(ds.num_rows() > 0, "{name} has zero rows"); + assert_eq!( + ds.metric(), + VectorMetric::Cosine, + "{name} must be cosine for v1" + ); + let url = ds.parquet_url(); + assert!(url.starts_with("https://assets.zilliz.com/benchmark/")); + assert!(url.ends_with("/train.parquet")); + } + } +} diff --git a/vortex-tensor/benches/similarity_search_common/mod.rs b/vortex-tensor/benches/similarity_search_common/mod.rs index c22cb5a9f08..b95867b26b7 100644 --- a/vortex-tensor/benches/similarity_search_common/mod.rs +++ b/vortex-tensor/benches/similarity_search_common/mod.rs @@ -30,22 +30,14 @@ use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; -use vortex_array::arrays::ConstantArray; use vortex_array::arrays::Extension; use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::extension::ExtensionArrayExt; use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; -use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; -use vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::dtype::PType; use vortex_array::dtype::extension::ExtDType; use vortex_array::extension::EmptyMetadata; -use vortex_array::scalar::Scalar; -use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::session::ArraySession; use vortex_array::validity::Validity; use vortex_btrblocks::BtrBlocksCompressor; @@ -54,12 +46,9 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_panic; use 
vortex_session::VortexSession; -use vortex_tensor::encodings::turboquant::TurboQuantConfig; -use vortex_tensor::encodings::turboquant::turboquant_encode_unchecked; -use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; -use vortex_tensor::scalar_fns::l2_denorm::L2Denorm; -use vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm; use vortex_tensor::vector::Vector; +pub use vortex_tensor::vector_search::build_similarity_search_tree; +pub use vortex_tensor::vector_search::compress_turboquant; /// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and /// example can create execution contexts cheaply. @@ -146,67 +135,20 @@ pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec` extension array whose storage is a [`ConstantArray`] broadcasting a -/// single query vector across `num_rows` rows. This is how we hand a single query vector to -/// `CosineSimilarity` on the `rhs` side -- `ScalarFnArray` requires both children to have the -/// same length, so we broadcast the query instead of hand-rolling a 1-row input. -fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult { - let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable); - - let children: Vec = query - .iter() - .map(|&v| Scalar::primitive(v, Nullability::NonNullable)) - .collect(); - let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable); - - let storage = ConstantArray::new(storage_scalar, num_rows).into_array(); - - let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage.dtype().clone())?.erased(); - Ok(ExtensionArray::new(ext_dtype, storage).into_array()) -} - /// Compresses a raw `Vector` array with the default BtrBlocks pipeline. /// /// [`BtrBlocksCompressor`] walks into the extension array and recursively compresses the /// underlying FSL storage child. 
TurboQuant is *not* exercised by this path -- it is not /// registered in the default scheme set -- so this measures "generic" lossless compression /// applied to float vectors. +/// +/// Stays in this bench-only module because `BtrBlocksCompressor` is a dev-dependency of +/// `vortex-tensor`, so promoting it to the public `vector_search` module would drag the +/// `vortex-btrblocks` dep into `vortex-tensor`'s main dependency list. pub fn compress_default(data: ArrayRef) -> VortexResult { BtrBlocksCompressor::default().compress(&data) } -/// Compresses a raw `Vector` array with the TurboQuant pipeline by hand, producing the -/// same tree shape that -/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] would: -/// -/// ```text -/// L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms) -/// ``` -/// -/// Calling the encode helpers directly (instead of going through -/// `BtrBlocksCompressorBuilder::with_turboquant()`) lets this example avoid depending on the -/// `unstable_encodings` feature flag. -/// -/// See `vortex-tensor/src/encodings/turboquant/tests/mod.rs::normalize_and_encode` for the same -/// canonical recipe. -pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let l2_denorm = normalize_as_l2_denorm(data, ctx)?; - let normalized = l2_denorm.child_at(0).clone(); - let norms = l2_denorm.child_at(1).clone(); - let num_rows = l2_denorm.len(); - - let normalized_ext = normalized - .as_opt::() - .vortex_expect("normalized child should be an Extension array"); - - let config = TurboQuantConfig::default(); - // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is the - // invariant `turboquant_encode_unchecked` expects. 
- let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; - - Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) -} - /// Dispatch helper that builds the data array for the requested [`Variant`], starting from a /// single random-vector generation. Always returns an `ArrayRef` whose logical dtype is /// `Vector`. @@ -224,33 +166,3 @@ pub fn build_variant( Variant::TurboQuant => compress_turboquant(raw, ctx), } } - -/// Build the lazy similarity-search array tree for a prepared data array and a single query -/// vector. The returned tree is a boolean array of length `data.len()` where position `i` is -/// `true` iff `cosine_similarity(data[i], query) > threshold`. -/// -/// The tree shape is: -/// -/// ```text -/// Binary(Gt, [ -/// CosineSimilarity([data, ConstantArray(query_vec, n)]), -/// ConstantArray(threshold, n), -/// ]) -/// ``` -/// -/// This function does no execution; it is safe to call inside a benchmark setup closure. -pub fn build_similarity_search_tree( - data: ArrayRef, - query: &[f32], - threshold: f32, -) -> VortexResult { - let num_rows = data.len(); - let query_vec = build_constant_query_vector(query, num_rows)?; - - let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); - - let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); - let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); - - cosine.binary(threshold_array, Operator::Gt) -} diff --git a/vortex-tensor/public-api.lock b/vortex-tensor/public-api.lock index bec8df1cb29..90aea23a194 100644 --- a/vortex-tensor/public-api.lock +++ b/vortex-tensor/public-api.lock @@ -550,4 +550,12 @@ impl core::marker::Copy for vortex_tensor::vector::VectorMatcherMetadata impl core::marker::StructuralPartialEq for vortex_tensor::vector::VectorMatcherMetadata +pub mod vortex_tensor::vector_search + +pub fn vortex_tensor::vector_search::build_constant_query_vector(query: 
&[f32], num_rows: usize) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::build_similarity_search_tree(data: vortex_array::array::erased::ArrayRef, query: &[f32], threshold: f32) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::compress_turboquant(data: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_tensor::initialize(session: &vortex_session::VortexSession) diff --git a/vortex-tensor/src/lib.rs b/vortex-tensor/src/lib.rs index 3d3563aa8e4..b3cf6c21695 100644 --- a/vortex-tensor/src/lib.rs +++ b/vortex-tensor/src/lib.rs @@ -25,6 +25,8 @@ pub mod vector; pub mod encodings; +pub mod vector_search; + mod utils; /// Initialize the Vortex tensor library with a Vortex session. diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs new file mode 100644 index 00000000000..6934fa52a71 --- /dev/null +++ b/vortex-tensor/src/vector_search.rs @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Reusable helpers for building brute-force vector similarity search expressions over +//! [`Vector`] extension arrays. +//! +//! This module exposes three small building blocks that together make it straightforward to +//! stand up a cosine-similarity-plus-threshold scan on top of a prepared data array: +//! +//! - [`compress_turboquant`] applies the canonical TurboQuant encoding pipeline +//! (`L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)`) to a raw +//! `Vector` array without requiring the caller to plumb the +//! `unstable_encodings` feature flag on the `vortex` facade. +//! - [`build_constant_query_vector`] wraps a single query vector into a +//! [`Vector`] extension array whose storage is a [`ConstantArray`] broadcast +//! across `num_rows` rows. This is the shape expected by +//! 
[`CosineSimilarity::try_new_array`] for the RHS of a database-vs-query scan. +//! - [`build_similarity_search_tree`] wires everything together into a lazy +//! `Binary(Gt, [CosineSimilarity(data, query), threshold])` expression. +//! +//! Executing the tree from [`build_similarity_search_tree`] into a +//! [`BoolArray`](vortex_array::arrays::BoolArray) yields one boolean per row indicating whether +//! that row's cosine similarity to the query exceeds `threshold`. +//! +//! # Example +//! +//! ```ignore +//! use vortex_array::{ArrayRef, VortexSessionExecute}; +//! use vortex_array::arrays::BoolArray; +//! use vortex_session::VortexSession; +//! use vortex_tensor::vector_search::{build_similarity_search_tree, compress_turboquant}; +//! +//! fn run(session: &VortexSession, data: ArrayRef, query: &[f32]) -> anyhow::Result<()> { +//! let mut ctx = session.create_execution_ctx(); +//! let data = compress_turboquant(data, &mut ctx)?; +//! let tree = build_similarity_search_tree(data, query, 0.8)?; +//! let _matches: BoolArray = tree.execute(&mut ctx)?; +//! Ok(()) +//! } +//! ``` +//! +//! [`Vector`]: crate::vector::Vector +//! 
[`CosineSimilarity::try_new_array`]: crate::scalar_fns::cosine_similarity::CosineSimilarity::try_new_array + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::Extension; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::extension::EmptyMetadata; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::encodings::turboquant::TurboQuantConfig; +use crate::encodings::turboquant::turboquant_encode_unchecked; +use crate::scalar_fns::cosine_similarity::CosineSimilarity; +use crate::scalar_fns::l2_denorm::L2Denorm; +use crate::scalar_fns::l2_denorm::normalize_as_l2_denorm; +use crate::vector::Vector; + +/// Apply the canonical TurboQuant encoding pipeline to a `Vector` array. +/// +/// The returned array has the shape +/// `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` — exactly what +/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] produces when invoked through +/// `BtrBlocksCompressorBuilder::with_turboquant()`, but without requiring callers to enable +/// the `unstable_encodings` feature on the `vortex` facade. +/// +/// The input `data` must be a [`Vector`] extension array whose element type is `f32` and whose +/// dimensionality is at least +/// [`turboquant::MIN_DIMENSION`](crate::encodings::turboquant::MIN_DIMENSION). The TurboQuant +/// configuration used is [`TurboQuantConfig::default()`] (8-bit codes, 3 SORF rounds, seed 42). 
+/// +/// # Errors +/// +/// Returns an error if `data` is not a [`Vector`] extension array, if normalization fails, or +/// if the underlying TurboQuant encoder rejects the input shape. +pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let l2_denorm = normalize_as_l2_denorm(data, ctx)?; + let normalized = l2_denorm.child_at(0).clone(); + let norms = l2_denorm.child_at(1).clone(); + let num_rows = l2_denorm.len(); + + let Some(normalized_ext) = normalized.as_opt::() else { + vortex_bail!("normalize_as_l2_denorm must produce an Extension array child"); + }; + + let config = TurboQuantConfig::default(); + // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is + // the invariant `turboquant_encode_unchecked` expects. + let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; + + Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) +} + +/// Build a `Vector` extension array whose storage is a [`ConstantArray`] broadcasting +/// a single query vector across `num_rows` rows. +/// +/// This is the shape expected for the RHS of a database-vs-query +/// [`CosineSimilarity`](crate::scalar_fns::cosine_similarity::CosineSimilarity) scan: the +/// `ScalarFnArray` contract requires both children to have the same length, so rather than +/// hand-rolling a 1-row input we broadcast the query across the whole database. +/// +/// # Errors +/// +/// Returns an error if the [`Vector`] extension dtype rejects the constructed storage dtype. 
+pub fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult { + let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable); + + let children: Vec = query + .iter() + .map(|&v| Scalar::primitive(v, Nullability::NonNullable)) + .collect(); + let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable); + + let storage = ConstantArray::new(storage_scalar, num_rows).into_array(); + + let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, storage).into_array()) +} + +/// Build the lazy similarity-search expression tree for a prepared database array and a +/// single query vector. +/// +/// The returned array is a lazy boolean expression of length `data.len()` whose position `i` +/// is `true` iff `cosine_similarity(data[i], query) > threshold`. Executing it into a +/// [`BoolArray`](vortex_array::arrays::BoolArray) runs the full scan. +/// +/// The tree shape is: +/// +/// ```text +/// Binary(Gt, [ +/// CosineSimilarity([data, ConstantArray(query_vec, n)]), +/// ConstantArray(threshold, n), +/// ]) +/// ``` +/// +/// This function performs no execution; it is safe to call inside a benchmark setup closure. +/// +/// # Errors +/// +/// Returns an error if `query` has a length incompatible with `data`'s vector dimension, or +/// if any of the intermediate array constructors fails. 
+pub fn build_similarity_search_tree( + data: ArrayRef, + query: &[f32], + threshold: f32, +) -> VortexResult { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + + let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); + + let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); + let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); + + cosine.binary(threshold_array, Operator::Gt) +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::Extension; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::FixedSizeListArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::bool::BoolArrayExt; + use vortex_array::dtype::extension::ExtDType; + use vortex_array::extension::EmptyMetadata; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_buffer::BufferMut; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::build_constant_query_vector; + use super::build_similarity_search_tree; + use super::compress_turboquant; + use crate::vector::Vector; + + /// Build a `Vector` extension array from a flat f32 slice. Each contiguous + /// group of `DIM` values becomes one row. 
+ fn vector_array(dim: u32, values: &[f32]) -> VortexResult { + let dim_usize = dim as usize; + assert_eq!(values.len() % dim_usize, 0); + let num_rows = values.len() / dim_usize; + + let mut buf = BufferMut::::with_capacity(values.len()); + for &v in values { + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable); + let fsl = FixedSizeListArray::try_new( + elements.into_array(), + dim, + Validity::NonNullable, + num_rows, + )?; + + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) + } + + fn test_session() -> VortexSession { + VortexSession::empty().with::() + } + + #[test] + fn constant_query_vector_has_vector_extension_dtype() -> VortexResult<()> { + let query = vec![1.0f32, 0.0, 0.0, 0.0]; + let rhs = build_constant_query_vector(&query, 5)?; + + assert_eq!(rhs.len(), 5); + assert!(rhs.as_opt::().is_some()); + Ok(()) + } + + #[test] + fn similarity_search_tree_executes_to_bool_array() -> VortexResult<()> { + // 4 rows of 3-dim vectors; the first and last match the query [1, 0, 0]. + let data = vector_array( + 3, + &[ + 1.0, 0.0, 0.0, // + 0.0, 1.0, 0.0, // + 0.0, 0.0, 1.0, // + 1.0, 0.0, 0.0, // + ], + )?; + let query = [1.0f32, 0.0, 0.0]; + + let tree = build_similarity_search_tree(data, &query, 0.5)?; + let mut ctx = test_session().create_execution_ctx(); + let result: BoolArray = tree.execute(&mut ctx)?; + + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), 4); + assert!(bits.value(0)); + assert!(!bits.value(1)); + assert!(!bits.value(2)); + assert!(bits.value(3)); + Ok(()) + } + + #[test] + fn turboquant_roundtrip_preserves_ranking() -> VortexResult<()> { + // Build 6 rows of 128-dim vectors where row 0 is highly correlated with the query. + // TurboQuant should preserve the "row 0 is the best match" ordering. 
+ const DIM: u32 = 128; + const NUM_ROWS: usize = 6; + + let mut values = Vec::::with_capacity(NUM_ROWS * DIM as usize); + let query: Vec = (0..DIM as usize) + .map(|i| ((i as f32) * 0.017).sin()) + .collect(); + + // Row 0: identical to query (cosine=1.0) + values.extend_from_slice(&query); + // Row 1: query + noise + for (i, q) in query.iter().enumerate() { + values.push(q + 0.05 * ((i as f32) * 0.03).cos()); + } + // Rows 2..6: unrelated patterns + for row in 2..NUM_ROWS { + for i in 0..DIM as usize { + values.push(((row as f32 * 1.3 + i as f32) * 0.07).sin()); + } + } + + let data = vector_array(DIM, &values)?; + let mut ctx = test_session().create_execution_ctx(); + let compressed = compress_turboquant(data, &mut ctx)?; + assert_eq!(compressed.len(), NUM_ROWS); + + // Build a tree with a low threshold so row 0 (cosine=1.0 exact) matches. + let tree = build_similarity_search_tree(compressed, &query, 0.95)?; + let result: BoolArray = tree.execute(&mut ctx)?; + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), NUM_ROWS); + assert!( + bits.value(0), + "row 0 (identical to query) must match at threshold 0.95 even after TurboQuant" + ); + Ok(()) + } +} From fbf79f3d8a376ecffb449af740d694b93f0fb3d3 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sat, 11 Apr 2026 18:47:01 -0400 Subject: [PATCH 2/3] fix nullability Signed-off-by: Connor Tsui --- .../src/handrolled_baseline.rs | 57 ++++-- vortex-bench/src/conversions.rs | 174 ++++++++++++++---- 2 files changed, 186 insertions(+), 45 deletions(-) diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index f62c5198fb6..2a22f630ab8 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -49,7 +49,10 @@ use anyhow::bail; use arrow_array::Array; use arrow_array::FixedSizeListArray; use arrow_array::Float32Array; +use arrow_array::GenericListArray; +use 
arrow_array::LargeListArray; use arrow_array::ListArray; +use arrow_array::OffsetSizeTrait; use arrow_array::RecordBatch; use arrow_array::cast::AsArray; use arrow_schema::DataType; @@ -134,21 +137,15 @@ fn append_batch( return Ok(()); } + // `ListArray` and `LargeListArray` are both `GenericListArray`, differing + // only in their offset width (`i32` vs `i64`). Real VectorDBBench parquet files + // canonicalize to `LargeList` on read, while some smaller test fixtures + // still arrive as `List` — handle both through the same generic helper. if let Some(list) = column.as_any().downcast_ref::() { - let values: &Float32Array = list - .values() - .as_primitive_opt::() - .context("List emb column must have Float32 values")?; - let offsets = list.value_offsets(); - for i in 0..list.len() { - let start = offsets[i] as usize; - let end = offsets[i + 1] as usize; - let row_len = end - start; - maybe_set_dim(inferred_dim, row_len)?; - data.extend_from_slice(&values.values()[start..end]); - *num_rows += 1; - } - return Ok(()); + return append_generic_list(list, data, inferred_dim, num_rows); + } + if let Some(list) = column.as_any().downcast_ref::() { + return append_generic_list(list, data, inferred_dim, num_rows); } bail!( @@ -157,6 +154,38 @@ fn append_batch( ); } +/// Flatten a `GenericListArray` of `Float32` values into `data`. +/// +/// The offsets are used purely to validate that every row has the same length; +/// they're effectively discarded on the output side, since flattening `N` uniform +/// rows of length `dim` into one contiguous `Vec` just yields the total span +/// `values[first_offset..last_offset]` regardless of how that span is sliced by +/// per-row offsets. That's why this helper is generic over `OffsetSizeTrait` and +/// works verbatim for both `ListArray` (i32 offsets) and `LargeListArray` (i64 +/// offsets) — the only difference between the two is how wide an integer we cast +/// to `usize`, which `OffsetSizeTrait::as_usize` handles for us. 
+fn append_generic_list( + list: &GenericListArray, + data: &mut Vec, + inferred_dim: &mut Option, + num_rows: &mut usize, +) -> Result<()> { + let values: &Float32Array = list + .values() + .as_primitive_opt::() + .context("List emb column must have Float32 values")?; + let offsets = list.value_offsets(); + for i in 0..list.len() { + let start = offsets[i].as_usize(); + let end = offsets[i + 1].as_usize(); + let row_len = end - start; + maybe_set_dim(inferred_dim, row_len)?; + data.extend_from_slice(&values.values()[start..end]); + *num_rows += 1; + } + Ok(()) +} + fn maybe_set_dim(inferred_dim: &mut Option, new_dim: usize) -> Result<()> { match inferred_dim { Some(d) if *d == new_dim => Ok(()), diff --git a/vortex-bench/src/conversions.rs b/vortex-bench/src/conversions.rs index 9811abc903d..267c9633512 100644 --- a/vortex-bench/src/conversions.rs +++ b/vortex-bench/src/conversions.rs @@ -27,6 +27,8 @@ use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::List; use vortex::array::arrays::ListView; +use vortex::array::arrays::Primitive; +use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::chunked::ChunkedArrayExt; use vortex::array::arrays::list::ListArrayExt; use vortex::array::arrays::listview::recursive_list_from_list_view; @@ -253,7 +255,15 @@ pub async fn write_parquet_as_vortex( /// /// Returns an error if: /// - `input` is not a `List` or `Chunked` array. -/// - The element type is not a non-nullable float primitive (`f16`, `f32`, or `f64`). +/// - The element type is not a float primitive (`f16`, `f32`, or `f64`). A nullable +/// element dtype (`List`) is accepted as long as the runtime validity is +/// `NonNullable` or `AllValid`: parquet has no non-nullable-element list logical +/// type, so arrow-rs always marks list-of-float element fields as nullable on read +/// regardless of whether any element is actually missing. 
In that case the elements +/// are rewrapped as non-nullable before being embedded in the FSL. +/// - The element dtype is nullable *and* any element is actually null — i.e., +/// `Validity::AllInvalid` or any `Validity::Array` mask. Vector extension elements +/// must be non-null. /// - Any row has a different length than the first row. /// - The list validity is nullable (vector elements cannot be null at the row level). /// - The input has zero rows (the dimension cannot be inferred from empty input). @@ -305,12 +315,10 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { ptype ); } - if elem_nullability.is_nullable() { - vortex_bail!( - "list_to_vector_ext: element type must be non-nullable, got nullable {}", - ptype - ); - } + + // Extract the flat elements buffer up front: the nullable-handling branch below + // needs to inspect runtime validity before we can decide whether to rewrap it. + let raw_elements = list.sliced_elements()?; let num_rows = input.len(); if num_rows == 0 { @@ -349,18 +357,58 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { prev_end = end; } - let elements = list.sliced_elements()?; let expected_elements = num_rows .checked_mul(dim) .ok_or_else(|| vortex_err!("list_to_vector_ext: num_rows * dim overflows usize"))?; - if elements.len() != expected_elements { + if raw_elements.len() != expected_elements { vortex_bail!( "list_to_vector_ext: elements buffer has length {} but expected {}", - elements.len(), + raw_elements.len(), expected_elements ); } + // Parquet has no non-nullable-element list logical type, so arrow-rs marks every + // `List`'s element field as nullable on read regardless of what the writer + // intended. That propagates through `DType::from_arrow`, so every real embedding + // parquet file arrives shaped as `List` even when every value is present. 
+ // A nullable element dtype is losslessly convertible to a non-nullable FSL as + // long as the runtime validity is `NonNullable`/`AllValid`; we must only reject + // when a real null is present. + let elements = if elem_nullability.is_nullable() { + let primitive = raw_elements.as_opt::().ok_or_else(|| { + vortex_err!( + "list_to_vector_ext: expected nullable-float elements to downcast to \ + Primitive, got dtype {}", + raw_elements.dtype() + ) + })?; + match primitive.validity()? { + Validity::NonNullable | Validity::AllValid => { + // `to_host_sync` is a no-op for host-resident buffers, so this is a + // metadata change (rebuilding the array with a non-nullable dtype), + // not a data copy. + let byte_buffer = primitive.buffer_handle().to_host_sync(); + PrimitiveArray::from_byte_buffer(byte_buffer, *ptype, Validity::NonNullable) + .into_array() + } + Validity::AllInvalid => { + vortex_bail!( + "list_to_vector_ext: list has nullable element dtype with all-invalid \ + elements; Vector extension elements must be non-null" + ); + } + Validity::Array(_) => { + vortex_bail!( + "list_to_vector_ext: list has nullable element dtype with one or more \ + actual null elements; Vector extension elements must be non-null" + ); + } + } + } else { + raw_elements + }; + let dim_u32 = u32::try_from(dim) .map_err(|_| vortex_err!("list_to_vector_ext: dimension {dim} does not fit in u32"))?; @@ -372,6 +420,7 @@ pub fn list_to_vector_ext(input: ArrayRef) -> VortexResult { #[cfg(test)] mod tests { use vortex::array::IntoArray; + use vortex::array::arrays::BoolArray; use vortex::array::arrays::Extension; use vortex::array::arrays::List; use vortex::array::arrays::ListViewArray; @@ -383,6 +432,35 @@ mod tests { use super::list_to_vector_ext; + /// Build a `List` whose elements carry the given [`Validity`]. 
Passing + /// `Validity::NonNullable` produces a `List`; any other variant produces + /// a `List`, matching the shape `parquet_to_vortex_chunks` produces for + /// embedding columns after arrow-rs' canonicalization. + fn list_f32_with_element_validity( + values: &[f32], + dim: usize, + element_validity: Validity, + ) -> vortex::array::ArrayRef { + assert_eq!( + values.len() % dim, + 0, + "values.len() must be a multiple of dim" + ); + let num_rows = values.len() / dim; + let elements = PrimitiveArray::new::( + BufferMut::::from_iter(values.iter().copied()).freeze(), + element_validity, + ) + .into_array(); + let mut offsets_buf = BufferMut::::with_capacity(num_rows + 1); + for i in 0..=num_rows { + offsets_buf.push(i32::try_from(i * dim).unwrap()); + } + let offsets = + PrimitiveArray::new::(offsets_buf.freeze(), Validity::NonNullable).into_array(); + vortex::array::Array::::new(elements, offsets, Validity::NonNullable).into_array() + } + fn list_f32(rows: &[&[f32]]) -> vortex::array::ArrayRef { let mut elements = BufferMut::::with_capacity(rows.iter().map(|r| r.len()).sum()); let mut offsets = BufferMut::::with_capacity(rows.len() + 1); @@ -551,30 +629,64 @@ mod tests { } #[test] - fn nullable_element_dtype_is_rejected() { - // Build a `List` — a list whose elements have nullable dtype (even - // if every value happens to be present). The `Vector` extension type at the - // FSL level requires non-nullable elements, so this must be rejected. - // - // Passing `Validity::AllValid` to `PrimitiveArray::new` sets the ptype's - // nullability to `Nullable`, which is what triggers the rejection path even - // though every value is technically valid. 
- let elements = PrimitiveArray::new::( - BufferMut::::from_iter([1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]).freeze(), - Validity::AllValid, - ) - .into_array(); - let offsets = PrimitiveArray::new::( - BufferMut::::from_iter([0i32, 3, 6]).freeze(), - Validity::NonNullable, - ) - .into_array(); - let list = vortex::array::Array::::new(elements, offsets, Validity::NonNullable) - .into_array(); + fn nullable_elements_with_real_nulls_are_rejected() { + // A `List` whose elements carry a real `Validity::Array` mask with + // at least one `false` bit has one or more actually-missing values. The + // rejection here is about runtime nulls, not dtype metadata: a nullable + // element dtype with all-valid runtime validity is accepted (see + // `nullable_element_dtype_with_all_valid_elements_is_accepted`), because + // parquet-ingested embeddings always arrive shaped that way even when + // every value is present. A real null, on the other hand, cannot be + // represented in the Vector extension FSL and must be rejected rather + // than silently dropped. + let element_validity = Validity::Array( + BoolArray::from_iter([true, true, false, true, true, true]).into_array(), + ); + let list = + list_f32_with_element_validity(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, element_validity); + + let err = list_to_vector_ext(list).unwrap_err().to_string(); + assert!( + err.contains("one or more actual null elements"), + "unexpected error: {err}" + ); + } + + #[test] + fn nullable_element_dtype_with_all_valid_elements_is_accepted() { + // This is the regression test for the Cohere parquet case: every real + // VectorDBBench parquet file arrives as `List` with + // `Validity::AllValid` elements because parquet has no non-nullable + // list-element logical type and arrow-rs propagates the nullable bit + // through `DType::from_arrow`. 
`list_to_vector_ext` must accept this + // shape by rewrapping the elements as non-nullable before building the + // FSL, rather than rejecting outright on the dtype metadata. + let list = + list_f32_with_element_validity(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, Validity::AllValid); + + let wrapped = list_to_vector_ext(list).unwrap(); + assert_eq!(wrapped.len(), 2); + let ext = wrapped.as_opt::().expect("returns Extension"); + assert!(matches!( + ext.storage_array().dtype(), + DType::FixedSizeList(_, 3, _) + )); + } + + #[test] + fn nullable_element_dtype_with_all_invalid_elements_is_rejected() { + // A `List` whose elements are `Validity::AllInvalid` means every + // value is missing. Rewrapping as non-nullable would silently drop the + // validity and produce bogus vectors, so this must be rejected. + let list = list_f32_with_element_validity( + &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + 3, + Validity::AllInvalid, + ); let err = list_to_vector_ext(list).unwrap_err().to_string(); assert!( - err.contains("element type must be non-nullable"), + err.contains("all-invalid elements"), "unexpected error: {err}" ); } From c37689c1a7b6661337a2af28009b77dd7a9e4a4e Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sat, 11 Apr 2026 19:45:58 -0400 Subject: [PATCH 3/3] fix display and move stuff around Signed-off-by: Connor Tsui --- Cargo.lock | 1 + benchmarks/vector-search-bench/Cargo.toml | 1 + benchmarks/vector-search-bench/src/display.rs | 513 ++++++++++++++++++ .../src/handrolled_baseline.rs | 105 ++++ benchmarks/vector-search-bench/src/lib.rs | 1 + benchmarks/vector-search-bench/src/main.rs | 164 +++--- benchmarks/vector-search-bench/src/verify.rs | 2 +- vortex-tensor/src/vector_search.rs | 5 +- 8 files changed, 697 insertions(+), 95 deletions(-) create mode 100644 benchmarks/vector-search-bench/src/display.rs diff --git a/Cargo.lock b/Cargo.lock index aba600c1237..e3182d5864f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10043,6 +10043,7 @@ dependencies = [ "clap", "indicatif", 
"parquet 58.0.0", + "tabled", "tempfile", "tokio", "tracing", diff --git a/benchmarks/vector-search-bench/Cargo.toml b/benchmarks/vector-search-bench/Cargo.toml index bbc620572b5..0300d8b8e4a 100644 --- a/benchmarks/vector-search-bench/Cargo.toml +++ b/benchmarks/vector-search-bench/Cargo.toml @@ -22,6 +22,7 @@ arrow-schema = { workspace = true } clap = { workspace = true, features = ["derive"] } indicatif = { workspace = true } parquet = { workspace = true } +tabled = { workspace = true, features = ["std"] } tokio = { workspace = true, features = ["full"] } tracing = { workspace = true } vortex = { workspace = true } diff --git a/benchmarks/vector-search-bench/src/display.rs b/benchmarks/vector-search-bench/src/display.rs new file mode 100644 index 00000000000..c49b7feecae --- /dev/null +++ b/benchmarks/vector-search-bench/src/display.rs @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Variant-major table renderer for `vector-search-bench`. +//! +//! Unlike `compress-bench` and `random-access-bench` — which can lean on the +//! shared [`vortex_bench::display::render_table`] because their measurements +//! are keyed by `(engine, format)` and every column has the same set of row +//! names — `vector-search-bench` runs multiple "variants" that collapse to +//! the same [`vortex_bench::Format`] enum value (e.g. `vortex-uncompressed` +//! and `vortex-default` both report as `Format::OnDiskVortex`). The generic +//! helper would merge them into one column, so this module renders a table +//! keyed by **variant label** instead of `Target`. +//! +//! The output shape deliberately mirrors +//! [`vortex_bench::display::render_table`]: one column per variant (in the +//! order the user passed on `--formats`, with the first variant that +//! actually has a value serving as the ratio baseline for each row), +//! `tabled::Style::modern`, and per-cell green/yellow/red coloring based on +//! 
the ratio against the baseline. +//! +//! This module is only used for the `-d table` developer-inspection path. +//! The `-d gh-json` path (the one CI consumes) lives entirely in +//! [`vortex_bench::display::print_measurements_json`] and the `ToJson` +//! impls on the measurement structs, both of which are untouched. + +use std::io::Write; + +use anyhow::Result; +use tabled::builder::Builder; +use tabled::settings::Color; +use tabled::settings::Style; +use tabled::settings::themes::Colorization; +use vortex_bench::measurements::CompressionTimingMeasurement; +use vortex_bench::measurements::CustomUnitMeasurement; + +/// How a row's raw `f64` values are rendered in a cell. +/// +/// This axis is "the physical quantity being reported", not "the metric +/// name" — multiple metric names may share a [`RowFormat`]. The ratio +/// calculation always runs over the raw `f64`, so ratios stay comparable +/// regardless of how the value is displayed. +#[derive(Clone, Copy, Debug)] +pub enum RowFormat { + /// Raw value is nanoseconds; render as `"{x:.2} ms"`. + DurationNanos, + /// Raw value is bytes; render as `"{x:.2} MB"`. + Bytes, + /// Dimensionless absolute difference; render in scientific notation. + AbsDiff, + /// Recall fraction in `[0, 1]`; render with four decimal places. + Recall, +} + +impl RowFormat { + fn format(self, value: f64) -> String { + match self { + Self::DurationNanos => format!("{:.2} ms", value / 1_000_000.0), + Self::Bytes => format!("{:.2} MB", value / (1024.0 * 1024.0)), + Self::AbsDiff => format!("{value:.2e}"), + Self::Recall => format!("{value:.4}"), + } + } +} + +/// One row of the rendered table: a label plus one cell per variant column. +/// +/// `cells` is parallel to the `variant_labels` slice passed to +/// [`render_variants_table`]; `None` means "no measurement for this variant" +/// (e.g. the `handrolled` variant has no `compress time`) and renders as an +/// empty `-` cell. 
+#[derive(Clone, Debug)] +pub struct Row { + /// Row label, typically `"{metric_kind}/{dataset_name}"`. + pub label: String, + /// How to render the raw `f64` values in this row's cells. + pub format: RowFormat, + /// One entry per variant column, in the same order as `variant_labels`. + pub cells: Vec>, +} + +/// Render a variant-major table of `rows` to `writer`. +/// +/// `variant_labels` fixes the column order and must match the cell order in +/// every [`Row`]. `preferred_baseline` names the variant whose column should +/// be used as the ratio baseline: when the named column has a value for a +/// given row, that value is the baseline; otherwise the function falls back +/// to the first column whose cell is `Some`. `preferred_baseline = None` +/// (or a label that isn't in `variant_labels`) produces pure first-non-None +/// behavior. +/// +/// Baseline per row rather than per table matters because some rows have a +/// legitimate missing-cell pattern: e.g. `compress time` has no value for +/// `handrolled`, so even with `preferred_baseline = Some("handrolled")` +/// that row has to fall back to a non-handrolled column. +/// +/// The styling (`Style::modern`, three-tier ratio coloring) matches +/// [`vortex_bench::display::render_table`] so a human looking at both +/// tables side-by-side sees the same formatting conventions. +pub fn render_variants_table( + writer: &mut W, + variant_labels: &[String], + rows: &[Row], + preferred_baseline: Option<&str>, +) -> Result<()> { + // Resolve the preferred baseline label to a column index once up front; + // if the user passed a variant that isn't in this run's column set we + // just degrade to "first non-None" for every row. 
+ let preferred_col = + preferred_baseline.and_then(|label| variant_labels.iter().position(|v| v == label)); + + let mut builder = Builder::default(); + builder.push_record( + std::iter::once("Benchmark".to_owned()) + .chain(variant_labels.iter().cloned()) + .collect::>(), + ); + + let mut colors = Vec::new(); + for (row_idx, row) in rows.iter().enumerate() { + // Header is row 0, so data rows start at row_idx + 1. + let table_row_idx = row_idx + 1; + + // Prefer the caller-supplied baseline column for this row if it has + // a value; otherwise fall back to the first column that does. + let baseline_col = preferred_col + .filter(|&c| row.cells.get(c).copied().flatten().is_some()) + .or_else(|| row.cells.iter().position(Option::is_some)); + let baseline_value = baseline_col.and_then(|i| row.cells[i]); + + let mut record = vec![row.label.clone()]; + for (col_idx, cell) in row.cells.iter().enumerate() { + match (cell, baseline_value) { + (None, _) => record.push("-".to_owned()), + (Some(value), Some(baseline)) + if Some(col_idx) != baseline_col && baseline > 0.0 => + { + let ratio = value / baseline; + // Data columns are offset by 1 because of the label column. + colors.push(Colorization::exact( + vec![ratio_color(ratio)], + (table_row_idx, col_idx + 1), + )); + record.push(format!("{} ({:.2})", row.format.format(*value), ratio)); + } + (Some(value), _) => record.push(row.format.format(*value)), + } + } + builder.push_record(record); + } + + let mut table = builder.build(); + table.with(Style::modern()); + for color in colors { + table.with(color); + } + + writeln!(writer, "{table}")?; + Ok(()) +} + +/// Three-tier coloring matching `vortex_bench::display::color`. +/// +/// Green for "within 10% of baseline" (likely equal or better accounting +/// for noise), yellow for "up to 50% slower", red for "more than 50% +/// slower". Only applied to non-baseline cells with a computed ratio. 
+fn ratio_color(ratio: f64) -> Color { + if ratio > 1.5 { + Color::BG_RED | Color::FG_BLACK + } else if ratio > 1.1 { + Color::BG_YELLOW | Color::FG_BLACK + } else { + Color::BG_BRIGHT_GREEN | Color::FG_BLACK + } +} + +/// Assemble the [`Row`] list fed into [`render_variants_table`] by walking +/// the already-collected measurement vecs. +/// +/// This function is deliberately **additive over** the measurement-push +/// code in `main.rs` — it doesn't touch how measurements are collected, +/// serialized, or shipped to CI. Row-cell lookup reconstructs the exact +/// metric-name strings the push-side code uses so there's no fragile +/// substring parsing: we already know the grammar because we pass the +/// same `variant_label` strings both to the push side and to this +/// builder. A row whose cells are all `None` for every variant is +/// suppressed via `push_row_if_any` (e.g. the `recall@k` row when +/// `--recall-queries 0`, or the `compress time` row when no vortex +/// variant is active). +pub fn build_table_rows( + variant_labels: &[String], + dataset_names: &[String], + recall_k: usize, + timings: &[CompressionTimingMeasurement], + sizes: &[CustomUnitMeasurement], + recalls: &[CustomUnitMeasurement], + verification: &[CustomUnitMeasurement], +) -> Vec { + let mut rows = Vec::new(); + + for dataset in dataset_names { + // Size — `handrolled` labels its size as `"{variant} size/{dataset}"` + // (parquet bytes on disk) while vortex variants use + // `"{variant} nbytes/{dataset}"` (in-memory nbytes). They're + // different quantities but both represent "how big is this + // variant", so they share one row. 
+ let size_cells: Vec> = variant_labels + .iter() + .map(|variant| { + let disk = format!("{variant} size/{dataset}"); + let nbytes = format!("{variant} nbytes/{dataset}"); + sizes + .iter() + .find(|m| m.name == disk || m.name == nbytes) + .map(|m| m.value) + }) + .collect(); + push_row_if_any( + &mut rows, + format!("size/{dataset}"), + RowFormat::Bytes, + size_cells, + ); + + // Timing rows — same `{metric} time/{variant}/{dataset}` grammar + // for every timing metric. `compress time` is legitimately absent + // for `handrolled`; `push_row_if_any` handles the fully-missing + // case (e.g. `--formats handrolled` alone) by skipping the row. + for metric in [ + "compress", + "decompress", + "cosine-similarity", + "cosine-filter", + ] { + let cells: Vec> = variant_labels + .iter() + .map(|variant| { + let name = format!("{metric} time/{variant}/{dataset}"); + timings + .iter() + .find(|t| t.name == name) + .map(|t| t.time.as_nanos() as f64) + }) + .collect(); + push_row_if_any( + &mut rows, + format!("{metric} time/{dataset}"), + RowFormat::DurationNanos, + cells, + ); + } + + // Correctness — every variant emits this. + let correctness_cells: Vec> = variant_labels + .iter() + .map(|variant| { + let name = format!("correctness-max-diff/{variant}/{dataset}"); + verification + .iter() + .find(|m| m.name == name) + .map(|m| m.value) + }) + .collect(); + push_row_if_any( + &mut rows, + format!("correctness-max-diff/{dataset}"), + RowFormat::AbsDiff, + correctness_cells, + ); + + // Recall — only `vortex-turboquant` emits this, and only when + // `--recall-queries > 0`. The row is skipped entirely when no + // variant has a value (which is the common dev case of passing + // `--recall-queries 0` for fast iteration). 
+ let recall_cells: Vec> = variant_labels + .iter() + .map(|variant| { + let name = format!("recall@{recall_k}/{variant}/{dataset}"); + recalls.iter().find(|m| m.name == name).map(|m| m.value) + }) + .collect(); + push_row_if_any( + &mut rows, + format!("recall@{recall_k}/{dataset}"), + RowFormat::Recall, + recall_cells, + ); + } + + rows +} + +/// Push a row into `rows` only if at least one cell has a value. +/// +/// Used to suppress entirely-empty rows like `compress time` when the user +/// passes `--formats handrolled` (no vortex variants) or `recall@k` when +/// `--recall-queries 0`. +fn push_row_if_any(rows: &mut Vec, label: String, format: RowFormat, cells: Vec>) { + if cells.iter().any(Option::is_some) { + rows.push(Row { + label, + format, + cells, + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn row_format_formats_duration_as_ms() { + assert_eq!(RowFormat::DurationNanos.format(1_500_000.0), "1.50 ms"); + assert_eq!(RowFormat::DurationNanos.format(0.0), "0.00 ms"); + } + + #[test] + fn row_format_formats_bytes_as_mb() { + assert_eq!(RowFormat::Bytes.format(1_048_576.0), "1.00 MB"); + assert_eq!(RowFormat::Bytes.format(314_572_800.0), "300.00 MB"); + } + + #[test] + fn row_format_formats_absdiff_scientific() { + assert_eq!(RowFormat::AbsDiff.format(0.005459249), "5.46e-3"); + assert_eq!(RowFormat::AbsDiff.format(0.0), "0.00e0"); + } + + #[test] + fn row_format_formats_recall_fourdp() { + assert_eq!(RowFormat::Recall.format(0.987654321), "0.9877"); + assert_eq!(RowFormat::Recall.format(1.0), "1.0000"); + } + + #[test] + fn render_smoke_two_variants_one_row() -> Result<()> { + let variants = vec!["handrolled".to_owned(), "vortex-uncompressed".to_owned()]; + let rows = vec![Row { + label: "decompress time/cohere-small".to_owned(), + format: RowFormat::DurationNanos, + cells: vec![Some(2_000_000.0), Some(4_000_000.0)], + }]; + + let mut out = Vec::new(); + render_variants_table(&mut out, &variants, &rows, None)?; + let rendered = 
String::from_utf8(out).expect("table is utf8"); + + assert!(rendered.contains("Benchmark"), "missing header: {rendered}"); + assert!( + rendered.contains("handrolled"), + "missing variant: {rendered}" + ); + assert!( + rendered.contains("vortex-uncompressed"), + "missing variant: {rendered}" + ); + assert!( + rendered.contains("decompress time/cohere-small"), + "missing row label: {rendered}" + ); + assert!( + rendered.contains("2.00 ms"), + "missing baseline cell: {rendered}" + ); + assert!( + rendered.contains("4.00 ms"), + "missing non-baseline cell: {rendered}" + ); + assert!( + rendered.contains("(2.00)"), + "missing ratio annotation: {rendered}" + ); + Ok(()) + } + + #[test] + fn render_with_missing_first_cell_promotes_next_variant_to_baseline() -> Result<()> { + // Simulates the `compress time` row: `handrolled` (first column) has + // no value, so `vortex-uncompressed` should become the effective + // baseline and render with no ratio, while `vortex-turboquant` + // renders a ratio against it. + let variants = vec![ + "handrolled".to_owned(), + "vortex-uncompressed".to_owned(), + "vortex-turboquant".to_owned(), + ]; + let rows = vec![Row { + label: "compress time/cohere-small".to_owned(), + format: RowFormat::DurationNanos, + cells: vec![None, Some(1_000_000.0), Some(3_000_000.0)], + }]; + + let mut out = Vec::new(); + render_variants_table(&mut out, &variants, &rows, None)?; + let rendered = String::from_utf8(out).expect("table is utf8"); + + // The missing cell renders as "-". + assert!( + rendered.contains(" - "), + "missing dash placeholder: {rendered}" + ); + // The baseline cell has no ratio annotation (the `(...)` suffix). + assert!( + rendered.contains("1.00 ms"), + "missing baseline cell: {rendered}" + ); + // The non-baseline cell has a ratio against the promoted baseline. 
+ assert!( + rendered.contains("3.00 ms (3.00)"), + "missing non-baseline ratio: {rendered}" + ); + Ok(()) + } + + #[test] + fn preferred_baseline_is_used_when_present() -> Result<()> { + // With `preferred_baseline = Some("vortex-uncompressed")`, the + // middle column becomes the ratio baseline even though there's a + // valid value in the first column. `handrolled` (2 ms) renders as + // "0.50" relative to `vortex-uncompressed` (4 ms), and + // `vortex-turboquant` (8 ms) renders as "2.00". + let variants = vec![ + "handrolled".to_owned(), + "vortex-uncompressed".to_owned(), + "vortex-turboquant".to_owned(), + ]; + let rows = vec![Row { + label: "decompress time/cohere-small".to_owned(), + format: RowFormat::DurationNanos, + cells: vec![Some(2_000_000.0), Some(4_000_000.0), Some(8_000_000.0)], + }]; + + let mut out = Vec::new(); + render_variants_table(&mut out, &variants, &rows, Some("vortex-uncompressed"))?; + let rendered = String::from_utf8(out).expect("table is utf8"); + + // The baseline column has no ratio suffix. + assert!( + rendered.contains("4.00 ms") && !rendered.contains("4.00 ms ("), + "vortex-uncompressed should render without a ratio: {rendered}" + ); + // Handrolled is now expressed relative to the preferred baseline. + assert!( + rendered.contains("2.00 ms (0.50)"), + "handrolled should render with ratio 0.50: {rendered}" + ); + assert!( + rendered.contains("8.00 ms (2.00)"), + "vortex-turboquant should render with ratio 2.00: {rendered}" + ); + Ok(()) + } + + #[test] + fn preferred_baseline_falls_back_when_row_has_no_preferred_value() -> Result<()> { + // `compress time` again: preferred baseline is vortex-uncompressed, + // but when vortex-uncompressed has no value for this row (simulated + // with `None`), the function must fall back to the first-non-None + // column rather than panicking or skipping the row. 
+ let variants = vec![ + "vortex-uncompressed".to_owned(), + "vortex-default".to_owned(), + "vortex-turboquant".to_owned(), + ]; + let rows = vec![Row { + label: "compress time/cohere-small".to_owned(), + format: RowFormat::DurationNanos, + cells: vec![None, Some(2_000_000.0), Some(6_000_000.0)], + }]; + + let mut out = Vec::new(); + render_variants_table(&mut out, &variants, &rows, Some("vortex-uncompressed"))?; + let rendered = String::from_utf8(out).expect("table is utf8"); + + // vortex-default is now the promoted baseline (no ratio). + assert!( + rendered.contains("2.00 ms") && !rendered.contains("2.00 ms ("), + "vortex-default should render as promoted baseline: {rendered}" + ); + // vortex-turboquant is 3x vortex-default. + assert!( + rendered.contains("6.00 ms (3.00)"), + "vortex-turboquant should render ratio 3.00 vs promoted baseline: {rendered}" + ); + Ok(()) + } + + #[test] + fn preferred_baseline_label_not_in_variants_degrades_gracefully() -> Result<()> { + // If the caller passes a label that isn't in the variant list + // (e.g. `--formats handrolled,vortex-turboquant` with preferred + // `vortex-uncompressed`), the function must not panic and should + // fall back to first-non-None. + let variants = vec!["handrolled".to_owned(), "vortex-turboquant".to_owned()]; + let rows = vec![Row { + label: "decompress time/cohere-small".to_owned(), + format: RowFormat::DurationNanos, + cells: vec![Some(1_000_000.0), Some(2_000_000.0)], + }]; + + let mut out = Vec::new(); + render_variants_table(&mut out, &variants, &rows, Some("vortex-uncompressed"))?; + let rendered = String::from_utf8(out).expect("table is utf8"); + + // First column (handrolled) is the fallback baseline, second has a 2.00 ratio. 
+ assert!( + rendered.contains("2.00 ms (2.00)"), + "fallback baseline ratio missing: {rendered}" + ); + Ok(()) + } +} diff --git a/benchmarks/vector-search-bench/src/handrolled_baseline.rs b/benchmarks/vector-search-bench/src/handrolled_baseline.rs index 2a22f630ab8..f62fe0710cf 100644 --- a/benchmarks/vector-search-bench/src/handrolled_baseline.rs +++ b/benchmarks/vector-search-bench/src/handrolled_baseline.rs @@ -38,6 +38,7 @@ //! fair DBMS comparison. Future work could add DuckDB / DataFusion baselines alongside //! this one for the DBMS-level comparison. +use std::borrow::Cow; use std::fs::File; use std::path::Path; use std::time::Duration; @@ -57,8 +58,14 @@ use arrow_array::RecordBatch; use arrow_array::cast::AsArray; use arrow_schema::DataType; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use vortex_bench::Format; +use vortex_bench::measurements::CompressionTimingMeasurement; +use vortex_bench::measurements::CustomUnitMeasurement; +use crate::DEFAULT_THRESHOLD; use crate::VariantTimings; +use crate::verify::VerificationKind; +use crate::verify::verify_and_report_scores; /// Read the entire `emb` column of a parquet file into a single flat `Vec`, along /// with the dimension and row count. This is the *decompress* phase of the hand-rolled @@ -332,6 +339,104 @@ pub fn filter_loop(scores: &[f32], threshold: f32) -> Vec { scores.iter().map(|&s| s > threshold).collect() } +/// Run the hand-rolled timing + correctness pipeline for one dataset and +/// append the resulting measurements into the caller's collection vecs. +/// +/// This is the "push-side" wrapper around [`run_handrolled_baseline_timings`] +/// and [`verify_and_report_scores`]: the bench-loop in `main.rs` used to +/// inline this block of code alongside the vortex-variant loop, which was +/// long enough to obscure the actual dataset iteration. Extracting it keeps +/// `main.rs::main` focused on the outer control flow. 
+///
+/// The emitted measurement grammar is identical to the inlined version
+/// (same names, same [`Format::Parquet`] target), which matters because
+/// `gh-json` output is what CI consumes.
+///
+/// # Parameters
+///
+/// - `parquet_path`: on-disk parquet file for this dataset (the decompress
+///   phase re-reads it on every iteration).
+/// - `dataset_name`: the [`crate::PreparedDataset::name`] used as the
+///   dataset segment of every metric name.
+/// - `parquet_bytes`: size of `parquet_path` in bytes, emitted as the
+///   `handrolled size/` measurement.
+/// - `query`: the single-row query vector, forwarded to the cosine loop.
+/// - `baseline_scores`: ground-truth cosine scores for the verification
+///   pass. A drift from these bails the whole run.
+/// - `iterations`: number of timed iterations per phase (best-of-N).
+/// - `timings`, `sizes`, `verification`: the caller's collection vecs.
+///   This function **appends** to them — it does not replace or sort.
+#[allow(clippy::too_many_arguments)]
+pub fn run_handrolled_and_collect(
+    parquet_path: &Path,
+    dataset_name: &str,
+    parquet_bytes: u64,
+    query: &[f32],
+    baseline_scores: &[f32],
+    iterations: usize,
+    timings: &mut Vec<CompressionTimingMeasurement>,
+    sizes: &mut Vec<CustomUnitMeasurement>,
+    verification: &mut Vec<CustomUnitMeasurement>,
+) -> Result<()> {
+    let label = "handrolled";
+    let bench_name = format!("{label}/{dataset_name}");
+
+    // Timing runs first and returns the cosine scores from its final
+    // iteration; verification then reuses those scores rather than
+    // re-reading the parquet file. `cosine_loop` is deterministic, so
+    // the last-iteration scores equal what a separate pre-timing
+    // verification pass would produce — we just save one parquet read
+    // per dataset. If the scores drift from the Vortex baseline,
+    // `verify_and_report_scores` bails here (after the timing already
+    // ran, which is acceptable because the handrolled loop is cheap and
+    // we'd rather run it twice than skip correctness).
+ let result = + run_handrolled_baseline_timings(parquet_path, query, DEFAULT_THRESHOLD, iterations)?; + + let report = verify_and_report_scores( + &bench_name, + &result.last_scores, + baseline_scores, + VerificationKind::Lossless, + )?; + tracing::info!( + "{} verification (Lossless): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", + bench_name, + report.max_abs_diff, + report.mean_abs_diff, + ); + + verification.push(CustomUnitMeasurement { + name: format!("correctness-max-diff/{bench_name}"), + format: Format::Parquet, + unit: Cow::from("abs-diff"), + value: report.max_abs_diff, + }); + sizes.push(CustomUnitMeasurement { + name: format!("{label} size/{dataset_name}"), + format: Format::Parquet, + unit: Cow::from("bytes"), + value: parquet_bytes as f64, + }); + timings.push(CompressionTimingMeasurement { + name: format!("decompress time/{bench_name}"), + format: Format::Parquet, + time: result.timings.decompress, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-similarity time/{bench_name}"), + format: Format::Parquet, + time: result.timings.cosine, + }); + timings.push(CompressionTimingMeasurement { + name: format!("cosine-filter time/{bench_name}"), + format: Format::Parquet, + time: result.timings.filter, + }); + + Ok(()) +} + #[cfg(test)] mod tests { use std::fs::File; diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 4ae3fce26e2..9c0a7aec0c8 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -32,6 +32,7 @@ use std::time::Duration; use std::time::Instant; +pub mod display; pub mod handrolled_baseline; pub mod recall; pub mod verify; diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 5e78563ef22..3685241eb74 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -20,16 +20,16 @@ //! 
[`handrolled_baseline`](vector_search_bench::handrolled_baseline) for details. use std::borrow::Cow; -use std::io::Write; use std::path::PathBuf; use anyhow::Context; use anyhow::Result; use clap::Parser; use indicatif::ProgressBar; -use vector_search_bench::DEFAULT_THRESHOLD; use vector_search_bench::Variant; -use vector_search_bench::handrolled_baseline::run_handrolled_baseline_timings; +use vector_search_bench::display::build_table_rows; +use vector_search_bench::display::render_variants_table; +use vector_search_bench::handrolled_baseline::run_handrolled_and_collect; use vector_search_bench::prepare_dataset; use vector_search_bench::prepare_variant; use vector_search_bench::recall::DEFAULT_TOP_K; @@ -37,9 +37,7 @@ use vector_search_bench::recall::measure_recall_at_k; use vector_search_bench::run_timings; use vector_search_bench::verify::VerificationKind; use vector_search_bench::verify::compute_cosine_scores; -use vector_search_bench::verify::verify_and_report_scores; use vector_search_bench::verify::verify_variant; -use vortex_bench::Format; use vortex_bench::create_output_writer; use vortex_bench::datasets::Dataset; use vortex_bench::display::DisplayFormat; @@ -155,6 +153,19 @@ impl SelectableFormat { SelectableFormat::VortexTurboQuant => Some(Variant::VortexTurboQuant), } } + + /// Stable kebab-cased label for this variant, used as both the + /// `--formats` CLI value and the column label in the `-d table` + /// output. Must match the `#[clap(name = ...)]` attribute on each + /// enum variant — they sit adjacent so they can't drift. 
+    fn label(self) -> &'static str {
+        match self {
+            SelectableFormat::Handrolled => "handrolled",
+            SelectableFormat::VortexUncompressed => "vortex-uncompressed",
+            SelectableFormat::VortexDefault => "vortex-default",
+            SelectableFormat::VortexTurboQuant => "vortex-turboquant",
+        }
+    }
 }
 
 #[tokio::main]
 async fn main() -> Result<()> {
@@ -186,9 +197,16 @@
     let mut sizes: Vec<CustomUnitMeasurement> = Vec::new();
     let mut recalls: Vec<CustomUnitMeasurement> = Vec::new();
     let mut verification: Vec<CustomUnitMeasurement> = Vec::new();
+    // Dataset names used in metric strings — populated as the outer loop runs
+    // so the `-d table` row-construction pass knows exactly which datasets to
+    // emit rows for (and in which order). Kept separate from the measurement
+    // vecs because the table-row builder looks up measurements by the exact
+    // `prepared.name` string the push-side code used.
+    let mut dataset_names: Vec<String> = Vec::with_capacity(datasets.len());
 
     for dataset in &datasets {
         let prepared = prepare_dataset(dataset).await?;
+        dataset_names.push(prepared.name.clone());
         tracing::info!(
             "prepared {}: dim={}, num_rows={}",
             prepared.name,
@@ -208,78 +226,28 @@
         );
 
         // Hand-rolled baseline. Emitted as a separate pseudo-variant with label
-        // `handrolled` so it shows up in dashboards next to the Vortex variants. This
-        // is a hand-rolled Rust scalar cosine loop over a flat `Vec` decoded from
-        // parquet via `parquet-rs`; it must match the Vortex cosine scores within the
-        // lossless tolerance (f32 ULPs) because it's computing the same math on the
-        // same underlying f32 values.
-        //
-        // `target.format` stays `Format::Parquet` because the *storage* side is still
-        // parquet on disk — only the *compute* is hand-rolled. The metric `name` field
-        // carries the `handrolled` label so human readers can tell the compute apart
-        // from, say, a DuckDB `list_cosine_similarity` baseline on the same parquet.
- // - // Timing runs first and returns the cosine scores from its final iteration; - // verification then reuses those scores rather than re-reading the parquet - // file. `cosine_loop` is deterministic, so the last-iteration scores equal - // what a separate pre-timing verification pass would produce — we just save - // one parquet read per dataset. If the scores drift from the Vortex baseline, - // `verify_and_report_scores` bails here (after the timing already ran, which - // is acceptable because the handrolled loop is cheap and we'd rather run it - // twice than skip correctness). + // `handrolled` so it shows up in dashboards next to the Vortex variants. + // `target.format` stays `Format::Parquet` because the *storage* side is + // still parquet on disk — only the *compute* is hand-rolled. The metric + // `name` field carries the `handrolled` label so human readers can tell + // the compute apart from, say, a DuckDB `list_cosine_similarity` + // baseline on the same parquet. See + // [`handrolled_baseline::run_handrolled_and_collect`] for the full + // timing / verification / push pipeline — kept in the module that + // defines the baseline so this loop stays focused on dataset iteration. 
if run_handrolled_baseline { let parquet_path = dataset.to_parquet_path().await?; - let label = "handrolled"; - let bench_name = format!("{label}/{}", prepared.name); - - let baseline_result = run_handrolled_baseline_timings( + run_handrolled_and_collect( &parquet_path, + &prepared.name, + prepared.parquet_bytes, &prepared.query, - DEFAULT_THRESHOLD, - args.iterations, - )?; - - let handrolled_report = verify_and_report_scores( - &bench_name, - &baseline_result.last_scores, &baseline_scores, - VerificationKind::Lossless, + args.iterations, + &mut timings, + &mut sizes, + &mut verification, )?; - tracing::info!( - "{} verification (Lossless): max_abs_diff={:.2e}, mean_abs_diff={:.2e}", - bench_name, - handrolled_report.max_abs_diff, - handrolled_report.mean_abs_diff, - ); - verification.push(CustomUnitMeasurement { - name: format!("correctness-max-diff/{bench_name}"), - format: Format::Parquet, - unit: Cow::from("abs-diff"), - value: handrolled_report.max_abs_diff, - }); - - sizes.push(CustomUnitMeasurement { - name: format!("{label} size/{}", prepared.name), - format: Format::Parquet, - unit: Cow::from("bytes"), - value: prepared.parquet_bytes as f64, - }); - timings.push(CompressionTimingMeasurement { - name: format!("decompress time/{bench_name}"), - format: Format::Parquet, - time: baseline_result.timings.decompress, - }); - timings.push(CompressionTimingMeasurement { - name: format!("cosine-similarity time/{bench_name}"), - format: Format::Parquet, - time: baseline_result.timings.cosine, - }); - timings.push(CompressionTimingMeasurement { - name: format!("cosine-filter time/{bench_name}"), - format: Format::Parquet, - time: baseline_result.timings.filter, - }); - progress.inc(1); } @@ -382,26 +350,40 @@ async fn main() -> Result<()> { let mut writer = create_output_writer(&args.display_format, args.output_path, BENCHMARK_ID)?; match args.display_format { DisplayFormat::Table => { - // Our variants span multiple `Format` values *and* multiple labels that share a 
- // single `Format`, so the existing `render_table` helper (which groups by - // `Target`) would collapse them. Emit one line per measurement instead; this is - // only used for developer inspection — CI consumes `gh-json` via the arm below. - for timing in &timings { - writeln!(writer, "{} {} ns", timing.name, timing.time.as_nanos())?; - } - for size in &sizes { - writeln!(writer, "{} {} {}", size.name, size.value, size.unit)?; - } - for recall in &recalls { - writeln!( - writer, - "{} {:.4} {}", - recall.name, recall.value, recall.unit - )?; - } - for check in &verification { - writeln!(writer, "{} {:.6e} {}", check.name, check.value, check.unit)?; - } + // `vector_search_bench::display::render_variants_table` groups + // columns by **variant label** rather than by `Target`, because + // multiple vector-search variants legitimately share a single + // `Format` (e.g. `vortex-uncompressed` and `vortex-default` both + // map to `Format::OnDiskVortex`). The generic + // `vortex_bench::display::render_table` helper groups by + // `Target`, which would collapse those variants into one + // column, so we render locally instead. The `DisplayFormat::GhJson` + // arm below is untouched — CI still consumes gh-json byte-for-byte + // identically. + // + // `vortex-uncompressed` is used as the ratio baseline (when + // present in the run) so that `handrolled` legitimately renders + // as faster-than-baseline and vortex-default / vortex-turboquant + // render as ratios of the raw Vortex cost. When the user runs + // without `vortex-uncompressed`, the renderer falls back to the + // first column with a value. 
+            let variant_labels: Vec<String> =
+                args.formats.iter().map(|f| f.label().to_owned()).collect();
+            let rows = build_table_rows(
+                &variant_labels,
+                &dataset_names,
+                args.recall_k,
+                &timings,
+                &sizes,
+                &recalls,
+                &verification,
+            );
+            render_variants_table(
+                &mut writer,
+                &variant_labels,
+                &rows,
+                Some(SelectableFormat::VortexUncompressed.label()),
+            )?;
         }
         DisplayFormat::GhJson => {
             print_measurements_json(&mut writer, timings)?;
diff --git a/benchmarks/vector-search-bench/src/verify.rs b/benchmarks/vector-search-bench/src/verify.rs
index da2a73f1ea2..ee11982826b 100644
--- a/benchmarks/vector-search-bench/src/verify.rs
+++ b/benchmarks/vector-search-bench/src/verify.rs
@@ -161,7 +161,7 @@ pub fn verify_scores(
 ///
 /// Takes already-materialized `variant_scores` (as a `&[f32]`) rather than an
 /// `ArrayRef`, so both the Vortex-variant path (which computes scores via
-/// [`execute_cosine`](crate::execute_cosine)) and the hand-rolled baseline path (which
+/// [`execute_cosine`]) and the hand-rolled baseline path (which
 /// runs a plain Rust loop over a flat `Vec`) share the same error-handling,
 /// logging, and hard-fail logic without duplicating it in `main.rs`.
 ///
diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs
index 6934fa52a71..2521343c0c2 100644
--- a/vortex-tensor/src/vector_search.rs
+++ b/vortex-tensor/src/vector_search.rs
@@ -71,7 +71,7 @@ use crate::vector::Vector;
 ///
 /// The returned array has the shape
 /// `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` — exactly what
-/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] produces when invoked through
+/// [`crate::encodings::turboquant::TurboQuantScheme`] produces when invoked through
 /// `BtrBlocksCompressorBuilder::with_turboquant()`, but without requiring callers to enable
 /// the `unstable_encodings` feature on the `vortex` facade.
/// @@ -105,8 +105,7 @@ pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResu /// Build a `Vector` extension array whose storage is a [`ConstantArray`] broadcasting /// a single query vector across `num_rows` rows. /// -/// This is the shape expected for the RHS of a database-vs-query -/// [`CosineSimilarity`](crate::scalar_fns::cosine_similarity::CosineSimilarity) scan: the +/// This is the shape expected for the RHS of a database-vs-query [`CosineSimilarity`] scan: the /// `ScalarFnArray` contract requires both children to have the same length, so rather than /// hand-rolling a 1-row input we broadcast the query across the whole database. ///