vortex-data · a10y · Feb 9, 2026 · Feb 12, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -36,6 +36,7 @@ members = [
     "vortex-tui",
     "vortex-test/e2e",
     "vortex-test/e2e-cuda",
+    "vortex-test/e2e-cuda-scan",
     "xtask",
     # Encodings
     "encodings/fastlanes",

diff --git a/_typos.toml b/_typos.toml
@@ -1,5 +1,5 @@
 [default]
-extend-ignore-identifiers-re = ["FoR", "typ"]
+extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ"]
 # We support a few common special comments to tell the checker to ignore sections of code
 extend-ignore-re = [
     "(#|//)\\s*spellchecker:ignore-next-line\\n.*",                      # Ignore the next line

diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -88,6 +88,11 @@ impl VTable for PrimitiveVTable {
 
         let ptype = PType::try_from(dtype)?;
 
+        vortex_ensure!(
+            buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
+            "Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
+        );
+
         if buffer.len() != ptype.byte_width() * len {
             vortex_bail!(
                 "Buffer length {} does not match expected length {} for {}, {}",

diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs
@@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
 }
 
 impl BtrBlocksCompressorBuilder {
+    /// Creates a builder with no encodings enabled: every scheme set starts at its default value.
+    pub fn empty() -> Self {
+        Self {
+            int_schemes: Default::default(),
+            float_schemes: Default::default(),
+            string_schemes: Default::default(),
+        }
+    }
+
     /// Excludes the specified integer compression schemes.
     pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
         let codes: HashSet<_> = codes.into_iter().collect();

diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
@@ -18,6 +18,7 @@ workspace = true
 
 [features]
 default = []
+tracing = ["dep:tracing"]
 _test-harness = []
 unstable_encodings = ["vortex-zstd/unstable_encodings"]
 
@@ -26,12 +27,17 @@ arc-swap = { workspace = true }
 arrow-data = { workspace = true, features = ["ffi"] }
 arrow-schema = { workspace = true, features = ["ffi"] }
 async-trait = { workspace = true }
+bytes = { workspace = true }
 cudarc = { workspace = true, features = ["f16"] }
 fastlanes = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
 paste = { workspace = true }
-tracing = { workspace = true }
+tokio = { workspace = true, features = ["fs"] }
+tracing = { workspace = true, features = [
+    "std",
+    "attributes",
+], optional = true }
 vortex-alp = { workspace = true }
 vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }

diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs
@@ -6,27 +6,25 @@
 #![allow(clippy::unwrap_used)]
 #![allow(clippy::cast_possible_truncation)]
 
+mod common;
+
 use std::mem::size_of;
 use std::ops::Add;
+use std::sync::Arc;
+use std::sync::atomic::Ordering;
 use std::time::Duration;
 
 use criterion::BenchmarkId;
 use criterion::Criterion;
 use criterion::Throughput;
 use cudarc::driver::DeviceRepr;
-use cudarc::driver::PushKernelArg;
-use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
 use futures::executor::block_on;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::validity::Validity::NonNullable;
 use vortex_buffer::Buffer;
-use vortex_cuda::CudaBufferExt;
-use vortex_cuda::CudaDeviceBuffer;
-use vortex_cuda::CudaExecutionCtx;
+use vortex_cuda::BitPackedExecutor;
 use vortex_cuda::CudaSession;
-use vortex_cuda::bitpacked_cuda_kernel;
-use vortex_cuda::bitpacked_cuda_launch_config;
-use vortex_cuda::launch_cuda_kernel_with_config;
+use vortex_cuda::executor::CudaExecute;
 use vortex_cuda_macros::cuda_available;
 use vortex_cuda_macros::cuda_not_available;
 use vortex_dtype::NativePType;
@@ -35,6 +33,8 @@ use vortex_fastlanes::BitPackedArray;
 use vortex_fastlanes::unpack_iter::BitPacked;
 use vortex_session::VortexSession;
 
+use crate::common::TimedLaunchStrategy;
+
 const N_ROWS: usize = 100_000_000;
 
 /// Create a bit-packed array with the given bit width
@@ -56,54 +56,6 @@ where
         .vortex_expect("failed to create BitPacked array")
 }
 
-/// Launch the bit unpacking kernel and return elapsed GPU time
-fn launch_bitunpack_kernel_timed_typed<T>(
-    bitpacked_array: &BitPackedArray,
-    cuda_ctx: &mut CudaExecutionCtx,
-) -> vortex_error::VortexResult<Duration>
-where
-    T: BitPacked + DeviceRepr,
-    T::Physical: DeviceRepr,
-{
-    let packed = bitpacked_array.packed().clone();
-    let bit_width = bitpacked_array.bit_width();
-    let len = bitpacked_array.len();
-
-    // Move packed data to device if not already there
-    let device_input = if packed.is_on_device() {
-        packed
-    } else {
-        block_on(cuda_ctx.move_to_device(packed)?).vortex_expect("failed to move to device")
-    };
-
-    // Allocate output buffer
-    let output_slice = cuda_ctx
-        .device_alloc::<T>(len.next_multiple_of(1024))
-        .vortex_expect("failed to allocate output");
-    let output_buf = CudaDeviceBuffer::new(output_slice);
-
-    // Get device views
-    let input_view = device_input
-        .cuda_view::<T::Physical>()
-        .vortex_expect("failed to get input view");
-    let output_view = output_buf.as_view::<T>();
-
-    let output_width = size_of::<T>() * 8;
-    let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, cuda_ctx)?;
-    let mut launch_builder = cuda_ctx.launch_builder(&cuda_function);
-
-    launch_builder.arg(&input_view);
-    launch_builder.arg(&output_view);
-
-    let config = bitpacked_cuda_launch_config(output_width, len)?;
-
-    // Launch kernel
-    let events =
-        launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_BLOCKING_SYNC)?;
-
-    events.duration()
-}
-
 /// Generic benchmark function for a specific type and bit width
 fn benchmark_bitunpack_typed<T>(c: &mut Criterion, bit_width: u8, type_name: &str)
 where
@@ -123,19 +75,18 @@ where
         &array,
         |b, array| {
             b.iter_custom(|iters| {
-                let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
-                    .vortex_expect("failed to create execution context");
+                let timed = TimedLaunchStrategy::default();
+                let timer = Arc::clone(timed.get());
 
-                let mut total_time = Duration::ZERO;
+                let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+                    .vortex_expect("failed to create execution context")
+                    .with_launch_strategy(Arc::new(timed));
 
                 for _ in 0..iters {
-                    let kernel_time =
-                        launch_bitunpack_kernel_timed_typed::<T>(array, &mut cuda_ctx)
-                            .vortex_expect("kernel launch failed");
-                    total_time += kernel_time;
+                    block_on(BitPackedExecutor.execute(array.to_array(), &mut cuda_ctx)).unwrap();
                 }
 
-                total_time
+                Duration::from_nanos(timer.load(Ordering::Relaxed))
             });
         },
     );

diff --git a/vortex-cuda/benches/common/mod.rs b/vortex-cuda/benches/common/mod.rs
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+use std::sync::atomic::AtomicU64;
+use std::sync::atomic::Ordering;
+
+use cudarc::driver::sys::CUevent_flags;
+use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
+use vortex_cuda::CudaKernelEvents;
+use vortex_cuda::LaunchStrategy;
+use vortex_error::VortexResult;
+
+#[derive(Debug, Default)]
+pub struct TimedLaunchStrategy {
+    total_time_ns: Arc<AtomicU64>, // sum of completed event durations, in nanoseconds
+}
+
+impl TimedLaunchStrategy {
+    pub fn get(&self) -> &Arc<AtomicU64> {
+        &self.total_time_ns // borrowed handle; callers `Arc::clone` it to keep reading the total
+    }
+}
+
+impl LaunchStrategy for TimedLaunchStrategy {
+    fn event_flags(&self) -> CUevent_flags {
+        // Use CU_EVENT_BLOCKING_SYNC to make sure all events flush before we complete.
+        CU_EVENT_BLOCKING_SYNC
+    }
+
+    fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> {
+        // NOTE: `as u64` truncates u128 nanoseconds; lossless for any duration < ~584 years.
+        let elapsed_nanos = events.duration()?.as_nanos() as u64;
+        self.total_time_ns
+            .fetch_add(elapsed_nanos, Ordering::Relaxed);
+
+        Ok(())
+    }
+}