From 3f7a8289aa75c00bfcb1998d2f024ca1260e576e Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Feb 2026 16:16:50 -0500 Subject: [PATCH 01/14] measure scans Signed-off-by: Andrew Duffy fixup Signed-off-by: Andrew Duffy --- Cargo.lock | 26 +++ Cargo.toml | 1 + _typos.toml | 2 +- .../src/arrays/primitive/vtable/mod.rs | 5 + vortex-btrblocks/src/builder.rs | 9 + vortex-cuda/Cargo.toml | 5 +- vortex-cuda/benches/bitpacked_cuda.rs | 79 ++------ vortex-cuda/benches/common/mod.rs | 39 ++++ vortex-cuda/benches/date_time_parts_cuda.rs | 106 ++--------- vortex-cuda/benches/dict_cuda.rs | 94 +++------- vortex-cuda/benches/for_cuda.rs | 123 +++++++----- vortex-cuda/benches/runend_cuda.rs | 90 ++------- vortex-cuda/src/executor.rs | 72 ++++++- vortex-cuda/src/kernel/arrays/constant.rs | 35 ++-- vortex-cuda/src/kernel/arrays/dict.rs | 59 +++--- vortex-cuda/src/kernel/arrays/shared.rs | 5 + vortex-cuda/src/kernel/encodings/alp.rs | 29 ++- vortex-cuda/src/kernel/encodings/bitpacked.rs | 24 +-- .../src/kernel/encodings/date_time_parts.rs | 24 +-- .../kernel/encodings/decimal_byte_parts.rs | 5 + vortex-cuda/src/kernel/encodings/for_.rs | 20 +- vortex-cuda/src/kernel/encodings/runend.rs | 28 +-- vortex-cuda/src/kernel/encodings/sequence.rs | 20 +- vortex-cuda/src/kernel/encodings/zigzag.rs | 20 +- vortex-cuda/src/kernel/encodings/zstd.rs | 54 +++++- vortex-cuda/src/kernel/filter/mod.rs | 4 + vortex-cuda/src/kernel/mod.rs | 93 ++++----- vortex-cuda/src/kernel/patches/mod.rs | 25 +-- vortex-cuda/src/kernel/slice/mod.rs | 4 + vortex-cuda/src/lib.rs | 34 ++-- vortex-cuda/src/macros.rs | 42 +++++ vortex-cuda/src/session.rs | 7 +- vortex-cuda/src/stream.rs | 10 +- vortex-python/src/arrow.rs | 2 +- vortex-test/e2e-cuda-scan/Cargo.toml | 24 +++ vortex-test/e2e-cuda-scan/src/main.rs | 177 ++++++++++++++++++ vortex/src/lib.rs | 3 + 37 files changed, 827 insertions(+), 572 deletions(-) create mode 100644 vortex-cuda/benches/common/mod.rs create mode 100644 vortex-cuda/src/macros.rs 
create mode 100644 vortex-test/e2e-cuda-scan/Cargo.toml create mode 100644 vortex-test/e2e-cuda-scan/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 59bfa30932b..b186b96695a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9582,6 +9582,16 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.22" @@ -9592,12 +9602,15 @@ dependencies = [ "nu-ansi-term", "once_cell", "regex-automata", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -10120,6 +10133,7 @@ dependencies = [ "arrow-schema", "async-trait", "bindgen", + "bytes", "codspeed-criterion-compat-walltime", "cudarc", "fastlanes", @@ -10830,6 +10844,18 @@ dependencies = [ "vortex-cuda", ] +[[package]] +name = "vortex-test-e2e-cuda-scan" +version = "0.1.0" +dependencies = [ + "futures", + "tokio", + "tracing", + "tracing-subscriber", + "vortex", + "vortex-cuda", +] + [[package]] name = "vortex-tui" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bd82b6bac7f..7ab1c634acf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ members = [ "vortex-tui", "vortex-test/e2e", "vortex-test/e2e-cuda", + "vortex-test/e2e-cuda-scan", "xtask", # Encodings "encodings/fastlanes", diff --git a/_typos.toml b/_typos.toml index b2af33e423b..5d601d1036f 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,5 +1,5 @@ [default] -extend-ignore-identifiers-re = ["FoR", "typ"] +extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ"] # We support a few common special comments to tell the checker to ignore sections of code extend-ignore-re = [ "(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line diff --git 
a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs index d316e6e93cd..450f9faf99e 100644 --- a/vortex-array/src/arrays/primitive/vtable/mod.rs +++ b/vortex-array/src/arrays/primitive/vtable/mod.rs @@ -89,6 +89,11 @@ impl VTable for PrimitiveVTable { let ptype = PType::try_from(dtype)?; + vortex_ensure!( + buffer.is_aligned_to(Alignment::new(ptype.byte_width())), + "Misaligned buffer cannot be used to build PrimitiveArray of {ptype}" + ); + if buffer.len() != ptype.byte_width() * len { vortex_bail!( "Buffer length {} does not match expected length {} for {}, {}", diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index b71ca7f9caf..d329ec8c139 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder { } impl BtrBlocksCompressorBuilder { + /// Create a new builder with no encodings enabled. + pub fn empty() -> Self { + Self { + int_schemes: Default::default(), + float_schemes: Default::default(), + string_schemes: Default::default(), + } + } + /// Excludes the specified integer compression schemes. 
pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index 0636fc3a91d..bd4afb972f6 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -19,6 +19,7 @@ workspace = true [features] default = [] +tracing = ["dep:tracing"] _test-harness = [] unstable_encodings = ["vortex-zstd/unstable_encodings"] @@ -27,6 +28,7 @@ arc-swap = { workspace = true } arrow-data = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true, features = ["ffi"] } async-trait = { workspace = true } +bytes = { workspace = true } cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } flatbuffers = { workspace = true } @@ -34,7 +36,8 @@ futures = { workspace = true, features = ["executor"] } kanal = { workspace = true } paste = { workspace = true } prost = { workspace = true } -tracing = { workspace = true } +tokio = { workspace = true, features = ["fs"] } +tracing = { workspace = true, features = ["std", "attributes"] } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs index b551c91c9e1..0ef0e7f03f0 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -6,27 +6,25 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; use std::ops::Add; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use 
vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaDeviceBuffer; -use vortex_cuda::CudaExecutionCtx; +use vortex_cuda::BitPackedExecutor; use vortex_cuda::CudaSession; -use vortex_cuda::bitpacked_cuda_kernel; -use vortex_cuda::bitpacked_cuda_launch_config; -use vortex_cuda::launch_cuda_kernel_with_config; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -35,6 +33,8 @@ use vortex_fastlanes::BitPackedArray; use vortex_fastlanes::unpack_iter::BitPacked; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + const N_ROWS: usize = 100_000_000; /// Create a bit-packed array with the given bit width @@ -56,54 +56,6 @@ where .vortex_expect("failed to create BitPacked array") } -/// Launch the bit unpacking kernel and return elapsed GPU time -fn launch_bitunpack_kernel_timed_typed( - bitpacked_array: &BitPackedArray, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult -where - T: BitPacked + DeviceRepr, - T::Physical: DeviceRepr, -{ - let packed = bitpacked_array.packed().clone(); - let bit_width = bitpacked_array.bit_width(); - let len = bitpacked_array.len(); - - // Move packed data to device if not already there - let device_input = if packed.is_on_device() { - packed - } else { - block_on(cuda_ctx.move_to_device(packed)?).vortex_expect("failed to move to device") - }; - - // Allocate output buffer - let output_slice = cuda_ctx - .device_alloc::(len.next_multiple_of(1024)) - .vortex_expect("failed to allocate output"); - let output_buf = CudaDeviceBuffer::new(output_slice); - - // Get device views - let input_view = device_input - .cuda_view::() - .vortex_expect("failed to get input view"); - let output_view = output_buf.as_view::(); - - let output_width = size_of::() * 8; - let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, cuda_ctx)?; - let mut launch_builder = 
cuda_ctx.launch_builder(&cuda_function); - - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); - - let config = bitpacked_cuda_launch_config(output_width, len)?; - - // Launch kernel - let events = - launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_BLOCKING_SYNC)?; - - events.duration() -} - /// Generic benchmark function for a specific type and bit width fn benchmark_bitunpack_typed(c: &mut Criterion, bit_width: u8, type_name: &str) where @@ -123,19 +75,18 @@ where &array, |b, array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_bitunpack_kernel_timed_typed::(array, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on(BitPackedExecutor.execute(array.to_array(), &mut cuda_ctx)).unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/benches/common/mod.rs b/vortex-cuda/benches/common/mod.rs new file mode 100644 index 00000000000..94273ae599d --- /dev/null +++ b/vortex-cuda/benches/common/mod.rs @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; + +use cudarc::driver::sys::CUevent_flags; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; +use vortex_cuda::CudaKernelEvents; +use vortex_cuda::LaunchStrategy; +use vortex_error::VortexResult; + +#[derive(Debug, Default)] +pub struct TimedLaunchStrategy { 
+ total_time_ns: Arc, +} + +impl TimedLaunchStrategy { + pub fn get(&self) -> &Arc { + &self.total_time_ns + } +} + +impl LaunchStrategy for TimedLaunchStrategy { + fn event_flags(&self) -> CUevent_flags { + // using blocking_sync to make sure all events flush before we complete. + CU_EVENT_BLOCKING_SYNC + } + + fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> { + // NOTE: as long as the duration < 584 years this cast is safe. + let elapsed_nanos = events.duration()?.as_nanos() as u64; + self.total_time_ns + .fetch_add(elapsed_nanos, Ordering::Relaxed); + + Ok(()) + } +} diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index df38a563363..4a142974082 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -6,34 +6,37 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; -use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::DateTimePartsExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_datetime_parts::DateTimePartsArray; use vortex_dtype::DType; use vortex_dtype::Nullability; -use vortex_dtype::PType; use vortex_dtype::datetime::TimeUnit; use vortex_dtype::datetime::Timestamp; use vortex_error::VortexExpect; use vortex_session::VortexSession; +use 
crate::common::TimedLaunchStrategy; + fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray { let days: Vec = (0..len).map(|i| (i / 1000) as i16).collect(); let days_arr = PrimitiveArray::new(Buffer::from(days), Validity::NonNullable).into_array(); @@ -46,80 +49,6 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArr .vortex_expect("Failed to create DateTimePartsArray") } -/// Launches DateTimeParts decode kernel and returns elapsed GPU time. -fn launch_datetimeparts_kernel_timed( - dtp_array: &DateTimePartsArray, - time_unit: TimeUnit, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult { - let days_prim = dtp_array.days().to_primitive(); - - // TODO(0ax1): figure out how to represent constant array in CUDA kernels - let seconds_prim = dtp_array.seconds().to_primitive(); - let subseconds_prim = dtp_array.subseconds().to_primitive(); - - let output_len = dtp_array.len(); - - let divisor: i64 = match time_unit { - TimeUnit::Nanoseconds => 1_000_000_000, - TimeUnit::Microseconds => 1_000_000, - TimeUnit::Milliseconds => 1_000, - TimeUnit::Seconds => 1, - TimeUnit::Days => unreachable!("Days not supported for DateTimeParts"), - }; - - let days_device = block_on( - cuda_ctx - .copy_to_device(days_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy days to device"); - - let seconds_device = block_on( - cuda_ctx - .copy_to_device(seconds_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy seconds to device"); - - let subseconds_device = block_on( - cuda_ctx - .copy_to_device(subseconds_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy subseconds to device"); - - // Allocate output buffer - let output_device = block_on(cuda_ctx.copy_to_device(vec![0i64; output_len]).unwrap()) - .vortex_expect("failed to allocate output buffer"); - - let days_view = days_device - .cuda_view::() - .vortex_expect("failed to get days 
view"); - let seconds_view = seconds_device - .cuda_view::() - .vortex_expect("failed to get seconds view"); - let subseconds_view = subseconds_device - .cuda_view::() - .vortex_expect("failed to get subseconds view"); - let output_view = output_device - .cuda_view::() - .vortex_expect("failed to get output view"); - - let array_len_u64 = output_len as u64; - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "date_time_parts", - ptypes: &[PType::I16, PType::I8, PType::I8], - launch_args: [days_view, seconds_view, subseconds_view, divisor, output_view, array_len_u64], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: output_len - ); - - events.duration() -} - fn benchmark_datetimeparts(c: &mut Criterion) { let mut group = c.benchmark_group("datetimeparts_cuda"); group.sample_size(10); @@ -139,19 +68,22 @@ fn benchmark_datetimeparts(c: &mut Criterion) { &dtp_array, |b, dtp_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_datetimeparts_kernel_timed(dtp_array, time_unit, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + // block on immediately here + block_on( + DateTimePartsExecutor.execute(dtp_array.to_array(), &mut cuda_ctx), + ) + .unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index c555d799a30..5c1ae658b38 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -6,30 
+6,34 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaDeviceBuffer; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::DictExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; use vortex_error::VortexExpect; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; /// Configuration for a dictionary benchmark specifying value and code types along with dictionary size. @@ -40,7 +44,7 @@ struct DictBenchConfig { } /// Creates a Dict array with parameterized value type V and code type C. 
-fn make_dict_array_typed(len: usize, dict_size: usize) -> (DictArray, Vec, Vec) +fn make_dict_array_typed(len: usize, dict_size: usize) -> DictArray where V: NativePType + From, C: NativePType + TryFrom, @@ -50,62 +54,16 @@ where let values: Vec = (0..dict_size) .map(|i| >::from((i * 1000) as u32)) .collect(); - let values_array = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let values_array = PrimitiveArray::new(Buffer::from(values), NonNullable); // Codes cycling through all dictionary values let codes: Vec = (0..len) .map(|i| C::try_from(i % dict_size).unwrap()) .collect(); - let codes_array = PrimitiveArray::new(Buffer::from(codes.clone()), NonNullable); - - let dict_array = DictArray::try_new(codes_array.into_array(), values_array.into_array()) - .vortex_expect("failed to create Dict array"); + let codes_array = PrimitiveArray::new(Buffer::from(codes), NonNullable); - (dict_array, values, codes) -} - -/// Launches Dict decompression kernel and returns elapsed GPU time. 
-fn launch_dict_kernel_timed_typed( - values: &[V], - codes: &[C], - output_len: usize, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult -where - V: NativePType + DeviceRepr, - C: NativePType + DeviceRepr, -{ - let values_device = block_on(cuda_ctx.copy_to_device(values.to_vec()).unwrap()) - .vortex_expect("failed to copy values to device"); - - let codes_device = block_on(cuda_ctx.copy_to_device(codes.to_vec()).unwrap()) - .vortex_expect("failed to copy codes to device"); - - let output_slice = cuda_ctx - .device_alloc::(output_len) - .vortex_expect("failed to allocate output"); - let output_device = CudaDeviceBuffer::new(output_slice); - - let codes_view = codes_device - .cuda_view::() - .vortex_expect("failed to get codes view"); - let values_view = values_device - .cuda_view::() - .vortex_expect("failed to get values view"); - let output_view = output_device.as_view::(); - - let codes_len_u64 = output_len as u64; - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "dict", - ptypes: &[V::PTYPE, C::PTYPE], - launch_args: [codes_view, codes_len_u64, values_view, output_view], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: output_len - ); - - events.duration() + DictArray::try_new(codes_array.into_array(), values_array.into_array()) + .vortex_expect("failed to create Dict array") } /// Benchmark Dict decompression for specific value and code types. 
@@ -122,7 +80,7 @@ where // Throughput is based on output size (values read from dictionary) group.throughput(Throughput::Bytes((len * size_of::()) as u64)); - let (dict_array, values, codes) = make_dict_array_typed::(*len, config.dict_size); + let dict_array = make_dict_array_typed::(*len, config.dict_size); group.bench_with_input( BenchmarkId::new( @@ -132,26 +90,22 @@ where config.value_type_name, config.code_type_name ), ), - &(dict_array, values, codes), - |b, (dict_array, values, codes)| { + &dict_array, + |b, dict_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = launch_dict_kernel_timed_typed::( - values, - codes, - dict_array.len(), - &mut cuda_ctx, - ) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on(DictExecutor.execute(dict_array.to_array(), &mut cuda_ctx)) + .vortex_expect("execute"); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index dcd76d9ea11..56b50486750 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -6,118 +6,135 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; use std::ops::Add; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use 
futures::executor::block_on; use vortex_array::IntoArray; -use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::FoRExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; +use vortex_dtype::PType; use vortex_error::VortexExpect; +use vortex_fastlanes::BitPackedArray; use vortex_fastlanes::FoRArray; use vortex_scalar::Scalar; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; const REFERENCE_VALUE: u8 = 10; /// Creates a FoR array with the specified type and length. -fn make_for_array_typed(len: usize) -> FoRArray +fn make_for_array_typed(len: usize, bp: bool) -> FoRArray where T: NativePType + From + Add, Scalar: From, { let reference = >::from(REFERENCE_VALUE); let data: Vec = (0..len) - .map(|i| >::from((i % 256) as u8) + reference) + .map(|i| >::from((i % 256) as u8)) .collect(); let primitive_array = PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array(); - FoRArray::try_new(primitive_array, reference.into()).vortex_expect("failed to create FoR array") + if bp && T::PTYPE != PType::U8 { + let child = + BitPackedArray::encode(primitive_array.as_ref(), 8).vortex_expect("failed to bitpack"); + FoRArray::try_new(child.into_array(), reference.into()) + .vortex_expect("failed to create FoR array") + } else { + FoRArray::try_new(primitive_array, reference.into()) + .vortex_expect("failed to create FoR array") + } } -/// Launches FoR decompression kernel and returns elapsed GPU time. 
-fn launch_for_kernel_timed_typed( - for_array: &FoRArray, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult +/// Benchmark FoR decompression for a specific type. +fn benchmark_for_typed(c: &mut Criterion, type_name: &str) where - T: NativePType + DeviceRepr + From, + T: NativePType + DeviceRepr + From + Add, + Scalar: From, { - let encoded = for_array.encoded(); - let unpacked_array = encoded.to_primitive(); - let unpacked_slice = unpacked_array.as_slice::(); + let mut group = c.benchmark_group("for_cuda"); + group.sample_size(10); - let device_data = block_on(cuda_ctx.copy_to_device(unpacked_slice.to_vec()).unwrap()) - .vortex_expect("failed to copy to device"); + for &(len, len_str) in BENCH_ARGS { + group.throughput(Throughput::Bytes((len * size_of::()) as u64)); - let reference = >::from(REFERENCE_VALUE); - let array_len_u64 = for_array.len() as u64; - - let device_view = device_data - .cuda_view::() - .vortex_expect("failed to get device view"); - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "for", - ptypes: &[for_array.ptype()], - launch_args: [device_view, reference, array_len_u64], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: for_array.len() - ); - - events.duration() + let for_array = make_for_array_typed::(len, false); + + group.bench_with_input( + BenchmarkId::new("for", format!("{len_str}_{type_name}")), + &for_array, + |b, for_array| { + b.iter_custom(|iters| { + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); + + for _ in 0..iters { + block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); + } + + Duration::from_nanos(timer.load(Ordering::Relaxed)) + }); + }, + ); + } + + group.finish(); } -/// Benchmark FoR decompression for a specific type. 
-fn benchmark_for_typed(c: &mut Criterion, type_name: &str) +fn benchmark_ffor_typed(c: &mut Criterion, type_name: &str) where T: NativePType + DeviceRepr + From + Add, Scalar: From, { - let mut group = c.benchmark_group("for_cuda"); + let mut group = c.benchmark_group("ffor_cuda"); group.sample_size(10); - for (len, len_str) in BENCH_ARGS { + for &(len, len_str) in BENCH_ARGS { group.throughput(Throughput::Bytes((len * size_of::()) as u64)); - let for_array = make_for_array_typed::(*len); + let for_array = make_for_array_typed::(len, true); group.bench_with_input( BenchmarkId::new("for", format!("{len_str}_{type_name}")), &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_for_kernel_timed_typed::(for_array, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); @@ -134,7 +151,15 @@ fn benchmark_for(c: &mut Criterion) { benchmark_for_typed::(c, "u64"); } -criterion::criterion_group!(benches, benchmark_for); +/// Benchmark FOR+BP decompression for all types. 
+fn benchmark_ffor(c: &mut Criterion) { + benchmark_ffor_typed::(c, "u8"); + benchmark_ffor_typed::(c, "u16"); + benchmark_ffor_typed::(c, "u32"); + benchmark_ffor_typed::(c, "u64"); +} + +criterion::criterion_group!(benches, benchmark_for, benchmark_ffor); #[cuda_available] criterion::criterion_main!(benches); diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index cb7b1effbb5..0ce1a37e11b 100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -6,31 +6,33 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; -use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::RunEndExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; -use vortex_dtype::PType; -use vortex_error::VortexExpect; use vortex_runend::RunEndArray; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + /// Creates a run-end encoded array with the specified output length and average run length. fn make_runend_array_typed(output_len: usize, avg_run_len: usize) -> RunEndArray where @@ -56,64 +58,6 @@ where RunEndArray::new(ends_array, values_array) } -/// Launches runend decode kernel and returns elapsed GPU time. 
-fn launch_runend_kernel_timed_typed( - runend_array: &RunEndArray, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult -where - T: NativePType + DeviceRepr, -{ - let ends_prim = runend_array.ends().to_primitive(); - let values_prim = runend_array.values().to_primitive(); - - let output_len = runend_array.len(); - let num_runs = ends_prim.len(); - let offset = runend_array.offset(); - - let ends_device = block_on( - cuda_ctx - .copy_to_device(ends_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy ends to device"); - - let values_device = block_on( - cuda_ctx - .copy_to_device(values_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy values to device"); - - let output_device = block_on( - cuda_ctx - .copy_to_device(vec![T::default(); output_len]) - .unwrap(), - ) - .vortex_expect("failed to allocate output buffer"); - - let ends_view = ends_device - .cuda_view::() - .vortex_expect("failed to get ends view"); - let values_view = values_device - .cuda_view::() - .vortex_expect("failed to get values view"); - let output_view = output_device - .cuda_view::() - .vortex_expect("failed to get output view"); - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "runend", - ptypes: &[T::PTYPE, PType::U64], - launch_args: [ends_view, num_runs, values_view, offset, output_len, output_view], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: output_len - ); - - events.duration() -} - /// Benchmark run-end decoding for a specific type with varying run lengths fn benchmark_runend_typed(c: &mut Criterion, type_name: &str) where @@ -137,20 +81,22 @@ where &runend_array, |b, runend_array| { b.iter_custom(|iters| { + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); - - let mut total_time = Duration::ZERO; + 
.unwrap() + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_runend_kernel_timed_typed::(runend_array, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on( + RunEndExecutor.execute(runend_array.to_array(), &mut cuda_ctx), + ) + .unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 31f66f3ecb8..2afa5bd048d 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -12,6 +12,7 @@ use cudarc::driver::CudaSlice; use cudarc::driver::CudaStream; use cudarc::driver::DeviceRepr; use cudarc::driver::LaunchArgs; +use cudarc::driver::LaunchConfig; use futures::future::BoxFuture; use vortex_array::Array; use vortex_array::ArrayRef; @@ -28,8 +29,14 @@ use vortex_error::vortex_err; use crate::CudaSession; use crate::ExportDeviceArray; +use crate::debug; +use crate::kernel::DefaultLaunchStrategy; +use crate::kernel::LaunchStrategy; +use crate::kernel::launch_cuda_kernel_impl; +use crate::kernel::launch_cuda_kernel_with_config; use crate::session::CudaSessionExt; use crate::stream::VortexCudaStream; +use crate::trace; /// CUDA kernel events recorded before and after kernel launch. #[derive(Debug)] @@ -57,6 +64,7 @@ pub struct CudaExecutionCtx { stream: VortexCudaStream, ctx: ExecutionCtx, cuda_session: CudaSession, + strategy: Arc, } impl CudaExecutionCtx { @@ -67,9 +75,68 @@ impl CudaExecutionCtx { stream, ctx, cuda_session, + strategy: Arc::new(DefaultLaunchStrategy), } } + /// Set the launch strategy for the execution context. + /// + /// This can only be set on setup (an "owned" context) and not from within + /// a kernel execution. + pub fn with_launch_strategy(mut self, launch_strategy: Arc) -> Self { + self.strategy = launch_strategy; + self + } + + /// Launch a Kernel function with args setup done by the provided `build_args` closure. 
+ /// + /// Kernels launched this way will use the default launch configuration, which provides no + /// shared memory bytes, and uses grid parameters based on the ideal thread block size for + /// the given `len`. + pub fn launch_kernel<'a, F>( + &'a mut self, + function: &'a CudaFunction, + len: usize, + build_args: F, + ) -> VortexResult<()> + where + F: FnOnce(&mut LaunchArgs<'a>), + { + let mut launcher = self.launch_builder(function); + build_args(&mut launcher); + + let events = launch_cuda_kernel_impl(&mut launcher, self.strategy.event_flags(), len)?; + self.strategy.on_complete(&events, len)?; + + drop(events); + + Ok(()) + } + + /// Launch a function with args provided by the `build_args` closure, with an explicit + /// [`LaunchConfig`], for kernels which need specific grid and shared memory configuration. + pub fn launch_kernel_config<'a, F>( + &'a mut self, + function: &'a CudaFunction, + cfg: LaunchConfig, + len: usize, + build_args: F, + ) -> VortexResult<()> + where + F: FnOnce(&mut LaunchArgs<'a>), + { + let mut launcher = self.launch_builder(function); + build_args(&mut launcher); + + let events = + launch_cuda_kernel_with_config(&mut launcher, cfg, self.strategy.event_flags())?; + self.strategy.on_complete(&events, len)?; + + drop(events); + + Ok(()) + } + /// Loads a CUDA kernel function by module name and ptype(s). 
/// /// # Arguments @@ -235,18 +302,19 @@ impl CudaArrayExt for ArrayRef { } if self.is_canonical() || self.is_empty() { + trace!(encoding = ?self.encoding_id(), "skipping canonical"); return self.execute(&mut ctx.ctx); } let Some(support) = ctx.cuda_session.kernel(&self.encoding_id()) else { - tracing::debug!( + debug!( encoding = %self.encoding_id(), "No CUDA support registered for encoding, falling back to CPU execution" ); return self.execute(&mut ctx.ctx); }; - tracing::debug!( + debug!( encoding = %self.encoding_id(), "Executing array on CUDA device" ); diff --git a/vortex-cuda/src/kernel/arrays/constant.rs b/vortex-cuda/src/kernel/arrays/constant.rs index f38784e3af7..b9acb133d07 100644 --- a/vortex-cuda/src/kernel/arrays/constant.rs +++ b/vortex-cuda/src/kernel/arrays/constant.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::ConstantArray; @@ -31,13 +30,13 @@ use vortex_error::vortex_err; use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for constant arrays with numeric types. /// /// Materializes a constant array by filling a device buffer with the scalar value. /// Supports primitive types (integers, floats) and decimal types (i128, i256). 
#[derive(Debug)] +#[doc(hidden)] pub struct ConstantNumericExecutor; impl ConstantNumericExecutor { @@ -48,6 +47,10 @@ impl ConstantNumericExecutor { #[async_trait] impl CudaExecute for ConstantNumericExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -114,16 +117,12 @@ where // Load kernel function let kernel_ptypes = [P::PTYPE]; let cuda_function = ctx.load_function_ptype("constant_numeric", &kernel_ptypes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - // Build launch args: output, value, length - launch_builder.arg(&output_view); - launch_builder.arg(&value); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&output_view); + args.arg(&value); + args.arg(&array_len_u64); + })?; // Wrap the CudaSlice in a CudaDeviceBuffer and then BufferHandle let device_buffer = CudaDeviceBuffer::new(output_buffer); @@ -174,16 +173,12 @@ where // Load kernel function let cuda_function = ctx.load_function("constant_numeric", &[&D::DECIMAL_TYPE.to_string()])?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - - // Build launch args: output, value, length - launch_builder.arg(&output_view); - launch_builder.arg(&value); - launch_builder.arg(&array_len_u64); - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&output_view); + args.arg(&value); + args.arg(&array_len_u64); + })?; // Wrap the CudaSlice in a CudaDeviceBuffer and then BufferHandle let device_buffer = CudaDeviceBuffer::new(output_buffer); diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index 4a9f2e07a8b..3aae120ac72 
100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -34,14 +34,18 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for dictionary-encoded arrays. #[derive(Debug)] +#[doc(hidden)] pub struct DictExecutor; #[async_trait] impl CudaExecute for DictExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -118,15 +122,14 @@ async fn execute_dict_prim_typed(); let codes_len_u64 = codes_len as u64; - // Launch the dict kernel - let _cuda_events = crate::launch_cuda_kernel!( - execution_ctx: ctx, - module: "dict", - ptypes: &[value_ptype, I::PTYPE], - launch_args: [codes_view, codes_len_u64, values_view, output_view], - event_recording: cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING, - array_len: codes_len - ); + + let kernel_function = ctx.load_function_ptype("dict", &[value_ptype, I::PTYPE])?; + ctx.launch_kernel(&kernel_function, codes_len, |args| { + args.arg(&codes_view) + .arg(&codes_len_u64) + .arg(&values_view) + .arg(&output_view); + })?; Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( BufferHandle::new_device(Arc::new(output_device)), @@ -177,6 +180,7 @@ async fn execute_dict_decimal_typed< ) -> VortexResult { assert!(!codes.is_empty()); let codes_len = codes.len(); + let codes_len_u64 = codes_len as u64; if codes_len == 0 { vortex_bail!("Cannot execute dict on empty codes array"); } @@ -211,18 +215,13 @@ async fn execute_dict_decimal_typed< "dict", &[&V::DECIMAL_TYPE.to_string(), &C::PTYPE.to_string()], )?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - launch_builder.arg(&codes_view); - launch_builder.arg(&codes_len); - launch_builder.arg(&values_view); - launch_builder.arg(&output_view); - - let _cuda_events = 
launch_cuda_kernel_impl( - &mut launch_builder, - cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING, - codes_len, - )?; + ctx.launch_kernel(&cuda_function, codes_len, |args| { + args.arg(&codes_view) + .arg(&codes_len_u64) + .arg(&values_view) + .arg(&output_view); + })?; Ok(Canonical::Decimal(DecimalArray::new_handle( BufferHandle::new_device(Arc::new(output_device)), @@ -283,19 +282,15 @@ async fn execute_dict_varbinview( let codes_ptype_str = C::PTYPE.to_string(); let cuda_function = ctx.load_function("dict", &["i128", &codes_ptype_str])?; - let mut launch_builder = ctx.launch_builder(&cuda_function); let codes_len_u64 = codes_len as u64; - launch_builder.arg(&codes_view); - launch_builder.arg(&codes_len_u64); - launch_builder.arg(&values_view); - launch_builder.arg(&output_view); - - let _cuda_events = launch_cuda_kernel_impl( - &mut launch_builder, - cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING, - codes_len, - )?; + + ctx.launch_kernel(&cuda_function, codes_len, |args| { + args.arg(&codes_view); + args.arg(&codes_len_u64); + args.arg(&values_view); + args.arg(&output_view); + })?; }); // Output views gathered by the kernel share the values' data buffers. diff --git a/vortex-cuda/src/kernel/arrays/shared.rs b/vortex-cuda/src/kernel/arrays/shared.rs index 8bd4e0e3f98..aba9f6ffb43 100644 --- a/vortex-cuda/src/kernel/arrays/shared.rs +++ b/vortex-cuda/src/kernel/arrays/shared.rs @@ -14,10 +14,15 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for SharedArray. 
#[derive(Debug)] +#[doc(hidden)] pub struct SharedExecutor; #[async_trait] impl CudaExecute for SharedExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index bf5753a47cb..ac1bd225f16 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_alp::ALPArray; use vortex_alp::ALPFloat; use vortex_alp::ALPVTable; @@ -31,14 +30,18 @@ use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; use crate::kernel::patches::execute_patches; -use crate::launch_cuda_kernel_impl; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. 
#[derive(Debug)] +#[doc(hidden)] pub struct ALPExecutor; #[async_trait] impl CudaExecute for ALPExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -87,20 +90,14 @@ where // Load kernel function let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE]; let cuda_function = ctx.load_function_ptype("alp", &kernel_ptypes)?; - { - let mut launch_builder = ctx.launch_builder(&cuda_function); - - // Build launch args: input, output, f, e, length - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); - launch_builder.arg(&f); - launch_builder.arg(&e); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; - } + + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&input_view) + .arg(&output_view) + .arg(&f) + .arg(&e) + .arg(&array_len_u64); + })?; // Check if there are any patches to decode here let output_buf = if let Some(patches) = array.patches() { diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 9f1d54e7209..bb115fe08ec 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -9,7 +9,6 @@ use cudarc::driver::CudaFunction; use cudarc::driver::DeviceRepr; use cudarc::driver::LaunchConfig; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -32,11 +31,11 @@ use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::kernel::launch_cuda_kernel_with_config; use crate::kernel::patches::execute_patches; /// CUDA decoder for bit-packed arrays. 
#[derive(Debug)] +#[doc(hidden)] pub struct BitPackedExecutor; impl BitPackedExecutor { @@ -47,6 +46,10 @@ impl BitPackedExecutor { #[async_trait] impl CudaExecute for BitPackedExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -87,7 +90,7 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe }) } -async fn decode_bitpacked( +pub(crate) async fn decode_bitpacked( array: BitPackedArray, ctx: &mut CudaExecutionCtx, ) -> VortexResult @@ -119,18 +122,11 @@ where let output_width = size_of::() * 8; let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?; + let config = bitpacked_cuda_launch_config(output_width, len)?; - { - let mut launch_builder = ctx.launch_builder(&cuda_function); - - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); - - let config = bitpacked_cuda_launch_config(output_width, len)?; - - let _cuda_events = - launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; - } + ctx.launch_kernel_config(&cuda_function, config, len, |args| { + args.arg(&input_view).arg(&output_view); + })?; let output_handle = match patches { None => BufferHandle::new_device(output_buf.slice_typed::(offset..(offset + len))), diff --git a/vortex-cuda/src/kernel/encodings/date_time_parts.rs b/vortex-cuda/src/kernel/encodings/date_time_parts.rs index 6bad3309301..154802546ec 100644 --- a/vortex-cuda/src/kernel/encodings/date_time_parts.rs +++ b/vortex-cuda/src/kernel/encodings/date_time_parts.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -35,16 +34,20 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaArrayExt; use 
crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for DateTimeParts arrays. /// /// Combines the days, seconds, and subseconds components into a single i64 timestamp array. #[derive(Debug)] +#[doc(hidden)] pub struct DateTimePartsExecutor; #[async_trait] impl CudaExecute for DateTimePartsExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -182,18 +185,17 @@ where ]; let kernel_suffix_strs: Vec<&str> = kernel_suffixes.iter().map(|s| s.as_str()).collect(); let cuda_function = ctx.load_function("date_time_parts", &kernel_suffix_strs)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - launch_builder.arg(&days_view); - launch_builder.arg(&seconds_view); - launch_builder.arg(&subseconds_view); - launch_builder.arg(&divisor); - launch_builder.arg(&output_view); let array_len_u64 = output_len as u64; - launch_builder.arg(&array_len_u64); - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, output_len)?; + ctx.launch_kernel(&cuda_function, output_len, |args| { + args.arg(&days_view) + .arg(&seconds_view) + .arg(&subseconds_view) + .arg(&divisor) + .arg(&output_view) + .arg(&array_len_u64); + })?; let output_buffer = BufferHandle::new_device(Arc::new(output_device)); let output_primitive = PrimitiveArray::from_buffer_handle(output_buffer, PType::I64, validity); diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs index 57472a1eb5e..37ac1a3e2ed 100644 --- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs +++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs @@ -20,10 +20,15 @@ use crate::executor::CudaExecute; // See `DecimalBytePartsArray` #[derive(Debug)] +#[doc(hidden)] pub struct DecimalBytePartsExecutor; #[async_trait] impl CudaExecute for 
DecimalBytePartsExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 9eab2c53de4..4a09d07071f 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -6,7 +6,7 @@ use std::fmt::Debug; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -25,10 +25,10 @@ use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA decoder for frame-of-reference. #[derive(Debug)] +#[doc(hidden)] pub struct FoRExecutor; impl FoRExecutor { @@ -39,6 +39,10 @@ impl FoRExecutor { #[async_trait] impl CudaExecute for FoRExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -79,16 +83,10 @@ where // Load kernel function let kernel_ptypes = [P::PTYPE]; let cuda_function = ctx.load_function_ptype("for", &kernel_ptypes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - // Build launch args: buffer, reference, length - launch_builder.arg(&cuda_view); - launch_builder.arg(&reference); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&cuda_view).arg(&reference).arg(&array_len_u64); + })?; // Build result - in-place reuses the same buffer 
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs index 846131e3861..29a0098034a 100644 --- a/vortex-cuda/src/kernel/encodings/runend.rs +++ b/vortex-cuda/src/kernel/encodings/runend.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::ConstantArray; @@ -33,10 +32,10 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for run-end encoded arrays. #[derive(Debug)] +#[doc(hidden)] pub struct RunEndExecutor; impl RunEndExecutor { @@ -47,6 +46,10 @@ impl RunEndExecutor { #[async_trait] impl CudaExecute for RunEndExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -131,18 +134,15 @@ async fn decode_runend_typed = kernel_ptypes.iter().map(|s| s.as_str()).collect(); let cuda_function = ctx.load_function("runend", &kernel_ptype_strs)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - - launch_builder.arg(&ends_view); - launch_builder.arg(&num_runs); - launch_builder.arg(&values_view); - launch_builder.arg(&offset); - launch_builder.arg(&output_len); - launch_builder.arg(&output_view); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, output_len)?; + + ctx.launch_kernel(&cuda_function, output_len, |args| { + args.arg(&ends_view) + .arg(&num_runs) + .arg(&values_view) + .arg(&offset) + .arg(&output_len) + .arg(&output_view); + })?; let output_validity = match values_validity { Validity::NonNullable => 
Validity::NonNullable, diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs index daf08bb9654..0a7de984f47 100644 --- a/vortex-cuda/src/kernel/encodings/sequence.rs +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use cudarc::driver::PushKernelArg; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -22,7 +22,6 @@ use vortex_sequence::SequenceVTable; use crate::CudaDeviceBuffer; use crate::CudaExecutionCtx; use crate::executor::CudaExecute; -use crate::launch_cuda_kernel; /// CUDA execution for `SequenceArray`. #[derive(Debug)] @@ -30,6 +29,10 @@ pub struct SequenceExecutor; #[async_trait] impl CudaExecute for SequenceExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -67,14 +70,11 @@ async fn execute_typed( let len_u64 = len as u64; - let _events = launch_cuda_kernel!( - execution_ctx: ctx, - module: "sequence", - ptypes: &[T::PTYPE], - launch_args: [buffer, base, multiplier, len_u64], - event_recording: CU_EVENT_DISABLE_TIMING, - array_len: len - ); + let kernel_func = ctx.load_function_ptype("sequence", &[T::PTYPE])?; + + ctx.launch_kernel(&kernel_func, len, |args| { + args.arg(&buffer).arg(&base).arg(&multiplier).arg(&len_u64); + })?; let output_buf = BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(buffer))); diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index 4b265b66a5b..f3726a972b3 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use 
cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -25,7 +24,6 @@ use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA decoder for ZigZag decoding. #[derive(Debug)] @@ -39,6 +37,10 @@ impl ZigZagExecutor { #[async_trait] impl CudaExecute for ZigZagExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -82,17 +84,11 @@ where let array_len_u64 = array_len as u64; // Load kernel function - let kernel_ptypes = [U::PTYPE]; - let cuda_function = ctx.load_function_ptype("zigzag", &kernel_ptypes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); + let cuda_function = ctx.load_function_ptype("zigzag", &[U::PTYPE])?; - // Build launch args: buffer, length - launch_builder.arg(&cuda_view); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&cuda_view).arg(&array_len_u64); + })?; // Build result - in-place, reinterpret as signed Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index d4b68937047..5db6e955ddd 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -35,6 +35,7 @@ use vortex_zstd::ZstdVTable; use crate::CudaBufferExt; use crate::CudaDeviceBuffer; +use crate::debug; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; @@ -196,6 +197,10 @@ impl ZstdExecutor { #[async_trait] impl CudaExecute for ZstdExecutor { + #[cfg_attr( + feature = "tracing", + 
tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -205,9 +210,9 @@ impl CudaExecute for ZstdExecutor { match zstd.as_ref().dtype() { DType::Binary(_) | DType::Utf8(_) => decode_zstd(zstd, ctx).await, - other => { - tracing::debug!( - dtype = %other, + _other => { + debug!( + dtype = %_other, "Only Binary/Utf8 ZSTD arrays supported on GPU, falling back to CPU" ); zstd.decompress()?.to_canonical() @@ -250,6 +255,49 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu let stream = ctx.stream(); + // NOTE(aduffy): we need to use the explicit tracing/not(tracing) blocks here because we go + // through nvcomp instead of delegating through the LaunchBuilder. + // We should find a way to bridge the two. + #[cfg(feature = "tracing")] + { + let before = stream + .record_event(Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT)) + .map_err(|e| vortex_err!("recording event: {e}"))?; + unsafe { + nvcomp_zstd::decompress_async( + exec.frame_ptrs_ptr as _, + exec.frame_sizes_ptr as _, + exec.output_sizes_ptr as _, + exec.device_actual_sizes.device_ptr_mut(stream).0 as _, + exec.num_frames, + exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _, + exec.nvcomp_temp_buffer_size, + exec.output_ptrs_ptr as _, + exec.device_statuses.device_ptr_mut(stream).0 as _, + stream.cu_stream().cast(), + ) + .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?; + } + + let after = stream + .record_event(Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT)) + .map_err(|e| vortex_err!("recording event: {e}"))?; + + // measure timing. 
note: this forces a sync + let duration = crate::CudaKernelEvents { + before_launch: before, + after_launch: after, + } + .duration()?; + + crate::trace!( + execution_nanos = duration.as_nanos(), + len = n_rows, + "ZSTD execution" + ); + } + + #[cfg(not(feature = "tracing"))] unsafe { nvcomp_zstd::decompress_async( exec.frame_ptrs_ptr as _, diff --git a/vortex-cuda/src/kernel/filter/mod.rs b/vortex-cuda/src/kernel/filter/mod.rs index f76c8f8126c..2e76a659b9b 100644 --- a/vortex-cuda/src/kernel/filter/mod.rs +++ b/vortex-cuda/src/kernel/filter/mod.rs @@ -42,6 +42,10 @@ pub struct FilterExecutor; #[async_trait] impl CudaExecute for FilterExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 9dca93a0e94..b247e8367f3 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -34,49 +34,54 @@ pub use filter::FilterExecutor; pub use slice::SliceExecutor; use crate::CudaKernelEvents; +#[cfg(feature = "tracing")] +use crate::trace; -/// Convenience macro to launch a CUDA kernel. +/// Trait for customizing kernel launch behavior. /// -/// The kernel gets launched on the stream of the execution context. -/// -/// The kernel launch config: -/// LaunchConfig { -/// grid_dim: (array.len() / 2048, 1, 1), -/// block_dim: (64, 1, 1), -/// shared_mem_bytes: 0, -/// }; -/// 64 threads are used per block which corresponds to 2 warps. -/// Each block handles 2048 elements. Each thread handles 32 elements. -/// The last block and thread are allowed to have less elements. -/// -/// Note: A macro is necessary to unroll the launch builder arguments. -/// -/// # Returns -/// -/// A pair of CUDA events submitted before and after the kernel. -/// Depending on `CUevent_flags` these events can contain timestamps. 
Use -/// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to -/// enable timestamps. -#[macro_export] -macro_rules! launch_cuda_kernel { - ( - execution_ctx: $ctx:expr, - module: $module:expr, - ptypes: $ptypes:expr, - launch_args: [$($arg:expr),* $(,)?], - event_recording: $event_recording:expr, - array_len: $len:expr - ) => {{ - use ::cudarc::driver::PushKernelArg as _; - let cuda_function = $ctx.load_function_ptype($module, $ptypes)?; - let mut launch_builder = $ctx.launch_builder(&cuda_function); - - $( - launch_builder.arg(&$arg); - )* - - $crate::launch_cuda_kernel_impl(&mut launch_builder, $event_recording, $len)? - }}; +/// Implementations can add tracing, async callbacks, or other behavior +/// around kernel launches. +pub trait LaunchStrategy: Debug + Send + Sync + 'static { + /// Returns the event flags to use for this launch. + fn event_flags(&self) -> CUevent_flags; + + /// Called after the kernel launch completes with the recorded events. + fn on_complete(&self, events: &CudaKernelEvents, len: usize) -> VortexResult<()>; +} + +/// Default launch strategy with no tracing overhead. +#[derive(Debug)] +pub struct DefaultLaunchStrategy; + +impl LaunchStrategy for DefaultLaunchStrategy { + fn event_flags(&self) -> CUevent_flags { + CUevent_flags::CU_EVENT_DISABLE_TIMING + } + + fn on_complete(&self, _events: &CudaKernelEvents, _len: usize) -> VortexResult<()> { + Ok(()) + } +} + +/// Launch strategy that records timing and emits trace events. 
+#[cfg(feature = "tracing")] +#[derive(Debug)] +pub struct TracingLaunchStrategy; + +#[cfg(feature = "tracing")] +impl LaunchStrategy for TracingLaunchStrategy { + fn event_flags(&self) -> CUevent_flags { + CUevent_flags::CU_EVENT_DEFAULT + } + + fn on_complete(&self, events: &CudaKernelEvents, len: usize) -> VortexResult<()> { + let duration = events.duration()?; + trace!( + execution_nanos = duration.as_nanos(), + len, "execution completed" + ); + Ok(()) + } } /// Launches a CUDA kernel with the passed launch builder. @@ -92,7 +97,7 @@ macro_rules! launch_cuda_kernel { /// Depending on `CUevent_flags` these events can contain timestamps. Use /// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to /// enable timestamps. -pub fn launch_cuda_kernel_impl( +pub(crate) fn launch_cuda_kernel_impl( launch_builder: &mut LaunchArgs, event_flags: CUevent_flags, array_len: usize, @@ -127,7 +132,7 @@ pub fn launch_cuda_kernel_impl( /// Depending on `CUevent_flags` these events can contain timestamps. Use /// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to /// enable timestamps. -pub fn launch_cuda_kernel_with_config( +pub(crate) fn launch_cuda_kernel_with_config( launch_builder: &mut LaunchArgs, config: LaunchConfig, event_flags: CUevent_flags, @@ -153,7 +158,7 @@ pub fn launch_cuda_kernel_with_config( /// /// Handles loading PTX files, compiling modules, and loading functions. 
#[derive(Debug)] -pub struct KernelLoader { +pub(crate) struct KernelLoader { /// Cache of loaded CUDA modules, keyed by module name modules: DashMap>, } diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index bff666ab66a..ced8d822bb5 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -2,7 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use cudarc::driver::PushKernelArg; use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::patches::Patches; use vortex_array::validity::Validity; @@ -16,7 +16,6 @@ use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::CudaExecutionCtx; use crate::executor::CudaArrayExt; -use crate::launch_cuda_kernel; /// Apply a set of patches in-place onto a [`CudaDeviceBuffer`] holding `ValuesT`. pub(crate) async fn execute_patches< @@ -77,20 +76,14 @@ pub(crate) async fn execute_patches< let d_patch_indices_view = d_patch_indices.cuda_view::()?; let d_patch_values_view = d_patch_values.cuda_view::()?; - // kernel arg order for patches is values, patchIndices, patchValues, patchesLen - let _events = launch_cuda_kernel!( - execution_ctx: ctx, - module: "patches", - ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], - launch_args: [ - d_target_view, - d_patch_indices_view, - d_patch_values_view, - patches_len_u64, - ], - event_recording: CU_EVENT_DISABLE_TIMING, - array_len: patches_len - ); + let kernel_func = ctx.load_function_ptype("patches", &[ValuesT::PTYPE, IndicesT::PTYPE])?; + + ctx.launch_kernel(&kernel_func, patches_len, |args| { + args.arg(&d_target_view) + .arg(&d_patch_indices_view) + .arg(&d_patch_values_view) + .arg(&patches_len_u64); + })?; Ok(target) } diff --git a/vortex-cuda/src/kernel/slice/mod.rs b/vortex-cuda/src/kernel/slice/mod.rs index 4b19dfc746e..13922bba805 100644 --- 
a/vortex-cuda/src/kernel/slice/mod.rs +++ b/vortex-cuda/src/kernel/slice/mod.rs @@ -19,6 +19,10 @@ pub struct SliceExecutor; #[async_trait] impl CudaExecute for SliceExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index adf66bcf8e2..b117d5e1d64 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -24,25 +24,27 @@ pub use device_buffer::CudaDeviceBuffer; pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; pub use host_to_device_allocator::CopyDeviceReadAt; -use kernel::ALPExecutor; -use kernel::BitPackedExecutor; -use kernel::ConstantNumericExecutor; -use kernel::DateTimePartsExecutor; -use kernel::DecimalBytePartsExecutor; -use kernel::DictExecutor; -use kernel::FilterExecutor; -use kernel::FoRExecutor; -use kernel::RunEndExecutor; -use kernel::SharedExecutor; -use kernel::ZigZagExecutor; +pub use kernel::ALPExecutor; +pub use kernel::BitPackedExecutor; +pub use kernel::ConstantNumericExecutor; +pub use kernel::DateTimePartsExecutor; +pub use kernel::DecimalBytePartsExecutor; +pub use kernel::DefaultLaunchStrategy; +pub use kernel::DictExecutor; +pub use kernel::FilterExecutor; +pub use kernel::FoRExecutor; +pub use kernel::LaunchStrategy; +pub use kernel::RunEndExecutor; +pub use kernel::SharedExecutor; +#[cfg(feature = "tracing")] +pub use kernel::TracingLaunchStrategy; +pub use kernel::ZigZagExecutor; #[cfg(feature = "unstable_encodings")] use kernel::ZstdBuffersExecutor; use kernel::ZstdExecutor; pub use kernel::ZstdKernelPrep; pub use kernel::bitpacked_cuda_kernel; pub use kernel::bitpacked_cuda_launch_config; -pub use kernel::launch_cuda_kernel_impl; -pub use kernel::launch_cuda_kernel_with_config; pub use kernel::zstd_kernel_prepare; pub use session::CudaSession; pub use session::CudaSessionExt; @@ -65,8 +67,8 @@ use vortex_zigzag::ZigZagVTable; use 
vortex_zstd::ZstdBuffersVTable; use vortex_zstd::ZstdVTable; -use crate::kernel::SequenceExecutor; -use crate::kernel::SliceExecutor; +pub use crate::kernel::SequenceExecutor; +pub use crate::kernel::SliceExecutor; /// Checks if CUDA is available on the system by looking for nvcc. pub fn cuda_available() -> bool { @@ -78,7 +80,7 @@ pub fn cuda_available() -> bool { /// Registers CUDA kernels. pub fn initialize_cuda(session: &CudaSession) { - tracing::info!("Registering CUDA kernels"); + info!("Registering CUDA kernels"); session.register_kernel(ALPVTable::ID, &ALPExecutor); session.register_kernel(BitPackedVTable::ID, &BitPackedExecutor); session.register_kernel(ConstantVTable::ID, &ConstantNumericExecutor); diff --git a/vortex-cuda/src/macros.rs b/vortex-cuda/src/macros.rs new file mode 100644 index 00000000000..e537995cd3e --- /dev/null +++ b/vortex-cuda/src/macros.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#[macro_export] +macro_rules! warn { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::warn!($($tts)*); + } + }; +} + +#[macro_export] +macro_rules! info { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::info!($($tts)*); + } + }; +} + +#[macro_export] +macro_rules! debug { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::debug!($($tts)*); + } + }; +} + +#[macro_export] +macro_rules!
trace { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::trace!($($tts)*); + } + }; +} diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 33233582116..14e42078f38 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -16,6 +16,7 @@ use crate::ExportDeviceArray; use crate::arrow::CanonicalDeviceArrayExport; use crate::executor::CudaExecute; pub use crate::executor::CudaExecutionCtx; +use crate::initialize_cuda; use crate::kernel::KernelLoader; use crate::stream::VortexCudaStream; use crate::stream_pool::VortexCudaStreamPool; @@ -128,7 +129,7 @@ impl CudaSession { } impl Default for CudaSession { - /// Creates a default CUDA session using device 0. + /// Creates a default CUDA session using device 0, with all GPU array kernels preloaded. /// /// # Panics /// @@ -136,7 +137,9 @@ impl Default for CudaSession { fn default() -> Self { #[expect(clippy::expect_used)] let context = CudaContext::new(0).expect("Failed to initialize CUDA device 0"); - Self::new(context) + let this = Self::new(context); + initialize_cuda(&this); + this } } diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs index fad54b36ab4..449cd6db072 100644 --- a/vortex-cuda/src/stream.rs +++ b/vortex-cuda/src/stream.rs @@ -19,6 +19,7 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use crate::CudaDeviceBuffer; +use crate::warn; #[derive(Clone)] pub struct VortexCudaStream(pub Arc); @@ -155,10 +156,11 @@ fn register_stream_callback(stream: &CudaStream) -> VortexResult) }; // Blocking send as we're in a callback invoked by the CUDA driver. - #[expect(clippy::expect_used)] - tx.send(()) - // A send should never fail. Panic otherwise. - .expect("CUDA callback receiver dropped unexpectedly"); + // NOTE: send can fail if the CudaEvent is dropped by the caller, in which case the receiver + // is closed and sends will fail.
+ if let Err(_e) = tx.send(()) { + warn!(error = ?_e, "register_stream_callback send failed due to error"); + } } // SAFETY: diff --git a/vortex-python/src/arrow.rs b/vortex-python/src/arrow.rs index ece662ee80e..861602f08c9 100644 --- a/vortex-python/src/arrow.rs +++ b/vortex-python/src/arrow.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2016-2025 Copyright The Apache Software Foundation // SPDX-FileCopyrightText: 2025 Copyright the Vortex contributors // SPDX-License-Identifier: Apache-2.0 -// SPDX-FileComment: Derived from upstream file arrow-pyarrow/src/lib.rs at commit 549709fb at https://github.com/apache/arrow-rs +// SPDX-FileComment: Derived from upstream file arrow-pyarrow/src/lib.rs at commit 549709fb at https://github.com/apache/arrow-rs // SPDX-FileNotice: https://github.com/apache/arrow-rs/blob/549709fbdf91cd1f6c263a7e4540c542b6fecf6b/NOTICE.txt #![allow(clippy::same_name_method)] diff --git a/vortex-test/e2e-cuda-scan/Cargo.toml b/vortex-test/e2e-cuda-scan/Cargo.toml new file mode 100644 index 00000000000..2e7d53e2f75 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "vortex-test-e2e-cuda-scan" +authors = { workspace = true } +description = "CUDA scan testing" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +publish = false +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +futures = { workspace = true, features = ["executor"] } +tokio = { workspace = true, features = ["macros", "full"] } +tracing = { workspace = true, features = ["std", "attributes"] } +tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } +vortex = { workspace = true } +vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"] } diff --git a/vortex-test/e2e-cuda-scan/src/main.rs
b/vortex-test/e2e-cuda-scan/src/main.rs new file mode 100644 index 00000000000..b82af917193 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/src/main.rs @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::env::args; +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use futures::StreamExt; +use tracing::Instrument; +use tracing_subscriber::EnvFilter; +use tracing_subscriber::fmt::format::FmtSpan; +use vortex::VortexSessionDefault; +use vortex::array::ToCanonical; +use vortex::array::arrays::DictVTable; +use vortex::buffer::ByteBuffer; +use vortex::buffer::ByteBufferMut; +use vortex::compressor::BtrBlocksCompressorBuilder; +use vortex::compressor::FloatCode; +use vortex::compressor::IntCode; +use vortex::compressor::StringCode; +use vortex::error::VortexResult; +use vortex::file::Footer; +use vortex::file::OpenOptionsSessionExt; +use vortex::file::WriteOptionsSessionExt; +use vortex::file::WriteStrategyBuilder; +use vortex::session::VortexSession; +use vortex_cuda::CopyDeviceReadAt; +use vortex_cuda::CudaSession; +use vortex_cuda::TracingLaunchStrategy; +use vortex_cuda::VortexCudaStreamPool; +use vortex_cuda::executor::CudaArrayExt; + +#[tokio::main] +pub async fn main() -> VortexResult<()> { + let args: Vec = args().collect(); + let json_output = args.iter().any(|arg| arg == "--json"); + + if json_output { + tracing_subscriber::fmt() + .json() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .init(); + } else { + tracing_subscriber::fmt() + .pretty() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .event_format(tracing_subscriber::fmt::format().with_target(true)) + .init(); + } + + let session = VortexSession::default(); + let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? 
+ .with_launch_strategy(Arc::new(TracingLaunchStrategy)); + + #[allow(clippy::expect_used, clippy::unwrap_in_result)] + let input_path = args + .iter() + .skip(1) + .find(|arg| !arg.starts_with("--")) + .expect("must provide path to .vortex file"); + let input_path = PathBuf::from(input_path); + + assert!(input_path.exists(), "input path does not exist"); + + let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; + + // Create a full scan that executes on the GPU + let cuda_stream = + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); + + let gpu_file = session + .open_options() + .with_footer(footer) + .open(Arc::new(gpu_reader)) + .await?; + + // execute_micros => µs to execute + let mut batches = gpu_file.scan()?.into_array_stream()?; + + let mut chunk = 0; + while let Some(next) = batches.next().await.transpose()? { + let record = next.to_struct(); + + for (field, field_name) in record + .unmasked_fields() + .iter() + .zip(record.struct_fields().names().iter()) + { + let field_name = field_name.to_string(); + // skip dict, varbin isn't properly implemented. + if field.is::() { + continue; + } + + let span = + tracing::info_span!("array execution", chunk = chunk, field_name = field_name); + + async { + if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { + tracing::error!("failed to execute_cuda on column"); + } + } + .instrument(span) + .await; + } + + chunk += 1; + } + + Ok(()) +} + +// Dump the values out as a new Vortex file for analysis. + +/// Recompress the input file using only GPU-executable encodings, returning the file as an +/// in-memory byte array. 
+async fn recompress_for_gpu( + input_path: impl AsRef, + session: &VortexSession, +) -> VortexResult<(ByteBuffer, Footer)> { + // Setup the reader + let input = session.open_options().open_path(input_path).await?; + + // Build a scan to read all columns from the input, and recompress them using only GPU-compatible + // encodings. + let scan = input.scan()?.into_array_stream()?; + + // Rebuild a copy of the file that only uses GPU-compatible compression algorithms. + let compressor = BtrBlocksCompressorBuilder::empty() + .include_int([ + IntCode::Uncompressed, + IntCode::Constant, + IntCode::BitPacking, + IntCode::For, + IntCode::Sequence, + IntCode::ZigZag, + IntCode::Dict, + ]) + .include_float([ + FloatCode::Uncompressed, + FloatCode::Constant, + FloatCode::Alp, + FloatCode::AlpRd, + FloatCode::RunEnd, + ]) + // Don't compress strings, this is b/c we don't have any BtrBlocks encodings that support + // strings. + .include_string([ + StringCode::Uncompressed, + StringCode::Constant, + StringCode::Dict, + StringCode::Zstd, + StringCode::ZstdBuffers, + ]) + .build(); + + // Read an input stream from a Vortex file. + let writer = WriteStrategyBuilder::default() + .with_compressor(compressor) + .build(); + + // Segment sink? 
+ let mut out = ByteBufferMut::empty(); + let result = session + .write_options() + .with_strategy(writer) + .write(&mut out, scan) + .await?; + + Ok((out.freeze(), result.footer().clone())) +} diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index 23d12726e87..c989d634cb2 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -33,6 +33,9 @@ pub mod compute2 { pub mod compressor { pub use vortex_btrblocks::BtrBlocksCompressor; pub use vortex_btrblocks::BtrBlocksCompressorBuilder; + pub use vortex_btrblocks::FloatCode; + pub use vortex_btrblocks::IntCode; + pub use vortex_btrblocks::StringCode; } pub mod dtype { From bdc0f08bd14f91055023b3664f4d9089e1639807 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 09:59:57 -0500 Subject: [PATCH 02/14] no pub *Executor Signed-off-by: Andrew Duffy --- vortex-cuda/benches/bitpacked_cuda.rs | 5 ++-- vortex-cuda/benches/date_time_parts_cuda.rs | 8 ++---- vortex-cuda/benches/dict_cuda.rs | 5 ++-- vortex-cuda/benches/for_cuda.rs | 7 ++--- vortex-cuda/benches/runend_cuda.rs | 8 ++---- vortex-cuda/src/kernel/arrays/constant.rs | 3 +- vortex-cuda/src/kernel/arrays/dict.rs | 3 +- vortex-cuda/src/kernel/arrays/mod.rs | 6 ++-- vortex-cuda/src/kernel/arrays/shared.rs | 3 +- vortex-cuda/src/kernel/encodings/alp.rs | 3 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 3 +- .../src/kernel/encodings/date_time_parts.rs | 3 +- .../kernel/encodings/decimal_byte_parts.rs | 3 +- vortex-cuda/src/kernel/encodings/for_.rs | 3 +- vortex-cuda/src/kernel/encodings/mod.rs | 22 +++++++-------- vortex-cuda/src/kernel/encodings/runend.rs | 3 +- vortex-cuda/src/kernel/encodings/sequence.rs | 2 +- vortex-cuda/src/kernel/encodings/zigzag.rs | 2 +- vortex-cuda/src/kernel/encodings/zstd.rs | 2 +- .../src/kernel/encodings/zstd_buffers.rs | 2 +- vortex-cuda/src/kernel/mod.rs | 14 ++++++---- vortex-cuda/src/lib.rs | 28 +++++++++---------- 22 files changed, 58 insertions(+), 80 deletions(-) diff --git a/vortex-cuda/benches/bitpacked_cuda.rs 
b/vortex-cuda/benches/bitpacked_cuda.rs index 0ef0e7f03f0..fb859607e12 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -22,9 +22,8 @@ use futures::executor::block_on; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use vortex_buffer::Buffer; -use vortex_cuda::BitPackedExecutor; use vortex_cuda::CudaSession; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -83,7 +82,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(BitPackedExecutor.execute(array.to_array(), &mut cuda_ctx)).unwrap(); + block_on(array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index 4a142974082..f378630cd8b 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -23,8 +23,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::DateTimePartsExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_datetime_parts::DateTimePartsArray; @@ -77,10 +76,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) { for _ in 0..iters { // block on immediately here - block_on( - DateTimePartsExecutor.execute(dtp_array.to_array(), &mut cuda_ctx), - ) - .unwrap(); + block_on(dtp_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index 5c1ae658b38..5ef61bc7d6c 100644 
--- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -24,8 +24,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::DictExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -101,7 +100,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(DictExecutor.execute(dict_array.to_array(), &mut cuda_ctx)) + block_on(dict_array.to_array().execute_cuda(&mut cuda_ctx)) .vortex_expect("execute"); } diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index 56b50486750..ce45b3d6041 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -24,8 +24,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::FoRExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -93,7 +92,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); + block_on(for_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) @@ -131,7 +130,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); + block_on(for_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index 0ce1a37e11b..7e5f4e5906d 
100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -23,8 +23,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::RunEndExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -90,10 +89,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on( - RunEndExecutor.execute(runend_array.to_array(), &mut cuda_ctx), - ) - .unwrap(); + block_on(runend_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/src/kernel/arrays/constant.rs b/vortex-cuda/src/kernel/arrays/constant.rs index b9acb133d07..8f0451d31e6 100644 --- a/vortex-cuda/src/kernel/arrays/constant.rs +++ b/vortex-cuda/src/kernel/arrays/constant.rs @@ -36,8 +36,7 @@ use crate::executor::CudaExecutionCtx; /// Materializes a constant array by filling a device buffer with the scalar value. /// Supports primitive types (integers, floats) and decimal types (i128, i256). #[derive(Debug)] -#[doc(hidden)] -pub struct ConstantNumericExecutor; +pub(crate) struct ConstantNumericExecutor; impl ConstantNumericExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index 3aae120ac72..29a72d9cf44 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -37,8 +37,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for dictionary-encoded arrays. 
#[derive(Debug)] -#[doc(hidden)] -pub struct DictExecutor; +pub(crate) struct DictExecutor; #[async_trait] impl CudaExecute for DictExecutor { diff --git a/vortex-cuda/src/kernel/arrays/mod.rs b/vortex-cuda/src/kernel/arrays/mod.rs index dc3a9a80fba..ab81934bb27 100644 --- a/vortex-cuda/src/kernel/arrays/mod.rs +++ b/vortex-cuda/src/kernel/arrays/mod.rs @@ -5,6 +5,6 @@ mod constant; mod dict; mod shared; -pub use constant::ConstantNumericExecutor; -pub use dict::DictExecutor; -pub use shared::SharedExecutor; +pub(crate) use constant::ConstantNumericExecutor; +pub(crate) use dict::DictExecutor; +pub(crate) use shared::SharedExecutor; diff --git a/vortex-cuda/src/kernel/arrays/shared.rs b/vortex-cuda/src/kernel/arrays/shared.rs index aba9f6ffb43..cae0dc68988 100644 --- a/vortex-cuda/src/kernel/arrays/shared.rs +++ b/vortex-cuda/src/kernel/arrays/shared.rs @@ -14,8 +14,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for SharedArray. #[derive(Debug)] -#[doc(hidden)] -pub struct SharedExecutor; +pub(crate) struct SharedExecutor; #[async_trait] impl CudaExecute for SharedExecutor { diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index ac1bd225f16..ff95fd9ed41 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -33,8 +33,7 @@ use crate::kernel::patches::execute_patches; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. #[derive(Debug)] -#[doc(hidden)] -pub struct ALPExecutor; +pub(crate) struct ALPExecutor; #[async_trait] impl CudaExecute for ALPExecutor { diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index bb115fe08ec..9f3f8641fa3 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -35,8 +35,7 @@ use crate::kernel::patches::execute_patches; /// CUDA decoder for bit-packed arrays. 
#[derive(Debug)] -#[doc(hidden)] -pub struct BitPackedExecutor; +pub(crate) struct BitPackedExecutor; impl BitPackedExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/date_time_parts.rs b/vortex-cuda/src/kernel/encodings/date_time_parts.rs index 154802546ec..ab5d1da1785 100644 --- a/vortex-cuda/src/kernel/encodings/date_time_parts.rs +++ b/vortex-cuda/src/kernel/encodings/date_time_parts.rs @@ -39,8 +39,7 @@ use crate::executor::CudaExecutionCtx; /// /// Combines the days, seconds, and subseconds components into a single i64 timestamp array. #[derive(Debug)] -#[doc(hidden)] -pub struct DateTimePartsExecutor; +pub(crate) struct DateTimePartsExecutor; #[async_trait] impl CudaExecute for DateTimePartsExecutor { diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs index 37ac1a3e2ed..042de8802d2 100644 --- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs +++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs @@ -20,8 +20,7 @@ use crate::executor::CudaExecute; // See `DecimalBytePartsArray` #[derive(Debug)] -#[doc(hidden)] -pub struct DecimalBytePartsExecutor; +pub(crate) struct DecimalBytePartsExecutor; #[async_trait] impl CudaExecute for DecimalBytePartsExecutor { diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 4a09d07071f..d5d8aa7ce48 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -28,8 +28,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA decoder for frame-of-reference. 
#[derive(Debug)] -#[doc(hidden)] -pub struct FoRExecutor; +pub(crate) struct FoRExecutor; impl FoRExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs index b26ca50e1cc..62a8d9f606d 100644 --- a/vortex-cuda/src/kernel/encodings/mod.rs +++ b/vortex-cuda/src/kernel/encodings/mod.rs @@ -13,18 +13,16 @@ mod zstd; #[cfg(feature = "unstable_encodings")] mod zstd_buffers; -pub use alp::ALPExecutor; -pub use bitpacked::BitPackedExecutor; -pub use bitpacked::bitpacked_cuda_kernel; -pub use bitpacked::bitpacked_cuda_launch_config; -pub use date_time_parts::DateTimePartsExecutor; -pub use decimal_byte_parts::DecimalBytePartsExecutor; -pub use for_::FoRExecutor; -pub use runend::RunEndExecutor; -pub use sequence::SequenceExecutor; -pub use zigzag::ZigZagExecutor; -pub use zstd::ZstdExecutor; +pub(crate) use alp::ALPExecutor; +pub(crate) use bitpacked::BitPackedExecutor; +pub(crate) use date_time_parts::DateTimePartsExecutor; +pub(crate) use decimal_byte_parts::DecimalBytePartsExecutor; +pub(crate) use for_::FoRExecutor; +pub(crate) use runend::RunEndExecutor; +pub(crate) use sequence::SequenceExecutor; +pub(crate) use zigzag::ZigZagExecutor; +pub(crate) use zstd::ZstdExecutor; pub use zstd::ZstdKernelPrep; pub use zstd::zstd_kernel_prepare; #[cfg(feature = "unstable_encodings")] -pub use zstd_buffers::ZstdBuffersExecutor; +pub(crate) use zstd_buffers::ZstdBuffersExecutor; diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs index 29a0098034a..9da8b636cd1 100644 --- a/vortex-cuda/src/kernel/encodings/runend.rs +++ b/vortex-cuda/src/kernel/encodings/runend.rs @@ -35,8 +35,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for run-end encoded arrays. 
#[derive(Debug)] -#[doc(hidden)] -pub struct RunEndExecutor; +pub(crate) struct RunEndExecutor; impl RunEndExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs index 0a7de984f47..354e26b0629 100644 --- a/vortex-cuda/src/kernel/encodings/sequence.rs +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -25,7 +25,7 @@ use crate::executor::CudaExecute; /// CUDA execution for `SequenceArray`. #[derive(Debug)] -pub struct SequenceExecutor; +pub(crate) struct SequenceExecutor; #[async_trait] impl CudaExecute for SequenceExecutor { diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index f3726a972b3..afbbc0458e5 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -27,7 +27,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA decoder for ZigZag decoding. #[derive(Debug)] -pub struct ZigZagExecutor; +pub(crate) struct ZigZagExecutor; impl ZigZagExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index 5db6e955ddd..c9980415c62 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -187,7 +187,7 @@ pub async fn zstd_kernel_prepare( /// CUDA executor for ZSTD decompression using nvCOMP. 
#[derive(Debug)] -pub struct ZstdExecutor; +pub(crate) struct ZstdExecutor; impl ZstdExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs index 56207741cbf..509ed41f121 100644 --- a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs +++ b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs @@ -34,7 +34,7 @@ use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; #[derive(Debug)] -pub struct ZstdBuffersExecutor; +pub(crate) struct ZstdBuffersExecutor; #[async_trait] impl CudaExecute for ZstdBuffersExecutor { diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index b247e8367f3..0bca57a15e1 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -26,12 +26,14 @@ mod filter; mod patches; mod slice; -pub use arrays::ConstantNumericExecutor; -pub use arrays::DictExecutor; -pub use arrays::SharedExecutor; -pub use encodings::*; -pub use filter::FilterExecutor; -pub use slice::SliceExecutor; +pub(crate) use arrays::ConstantNumericExecutor; +pub(crate) use arrays::DictExecutor; +pub(crate) use arrays::SharedExecutor; +pub use encodings::ZstdKernelPrep; +pub use encodings::zstd_kernel_prepare; +pub(crate) use encodings::*; +pub(crate) use filter::FilterExecutor; +pub(crate) use slice::SliceExecutor; use crate::CudaKernelEvents; #[cfg(feature = "tracing")] diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index b117d5e1d64..9b5b9c3bd6f 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -24,27 +24,25 @@ pub use device_buffer::CudaDeviceBuffer; pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; pub use host_to_device_allocator::CopyDeviceReadAt; -pub use kernel::ALPExecutor; -pub use kernel::BitPackedExecutor; -pub use kernel::ConstantNumericExecutor; -pub use kernel::DateTimePartsExecutor; -pub use kernel::DecimalBytePartsExecutor; +use 
kernel::ALPExecutor; +use kernel::BitPackedExecutor; +use kernel::ConstantNumericExecutor; +use kernel::DateTimePartsExecutor; +use kernel::DecimalBytePartsExecutor; pub use kernel::DefaultLaunchStrategy; -pub use kernel::DictExecutor; -pub use kernel::FilterExecutor; -pub use kernel::FoRExecutor; +use kernel::DictExecutor; +use kernel::FilterExecutor; +use kernel::FoRExecutor; pub use kernel::LaunchStrategy; -pub use kernel::RunEndExecutor; -pub use kernel::SharedExecutor; +use kernel::RunEndExecutor; +use kernel::SharedExecutor; #[cfg(feature = "tracing")] pub use kernel::TracingLaunchStrategy; -pub use kernel::ZigZagExecutor; +use kernel::ZigZagExecutor; #[cfg(feature = "unstable_encodings")] use kernel::ZstdBuffersExecutor; use kernel::ZstdExecutor; pub use kernel::ZstdKernelPrep; -pub use kernel::bitpacked_cuda_kernel; -pub use kernel::bitpacked_cuda_launch_config; pub use kernel::zstd_kernel_prepare; pub use session::CudaSession; pub use session::CudaSessionExt; @@ -67,8 +65,8 @@ use vortex_zigzag::ZigZagVTable; use vortex_zstd::ZstdBuffersVTable; use vortex_zstd::ZstdVTable; -pub use crate::kernel::SequenceExecutor; -pub use crate::kernel::SliceExecutor; +use crate::kernel::SequenceExecutor; +use crate::kernel::SliceExecutor; /// Checks if CUDA is available on the system by looking for nvcc. 
pub fn cuda_available() -> bool { From 878105a60d1df01ad41c3d0a7e95da624bcb6cc2 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 10:04:23 -0500 Subject: [PATCH 03/14] move crate Signed-off-by: Andrew Duffy --- Cargo.lock | 24 +++++++++---------- Cargo.toml | 2 +- .../gpu-scan-cli}/Cargo.toml | 2 +- vortex-cuda/gpu-scan-cli/README.md | 19 +++++++++++++++ .../gpu-scan-cli}/src/main.rs | 0 5 files changed, 33 insertions(+), 14 deletions(-) rename {vortex-test/e2e-cuda-scan => vortex-cuda/gpu-scan-cli}/Cargo.toml (95%) create mode 100644 vortex-cuda/gpu-scan-cli/README.md rename {vortex-test/e2e-cuda-scan => vortex-cuda/gpu-scan-cli}/src/main.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index b186b96695a..c5081a7a5c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4118,6 +4118,18 @@ dependencies = [ "yansi", ] +[[package]] +name = "gpu-scan-cli" +version = "0.1.0" +dependencies = [ + "futures", + "tokio", + "tracing", + "tracing-subscriber", + "vortex", + "vortex-cuda", +] + [[package]] name = "grid" version = "1.0.0" @@ -10844,18 +10856,6 @@ dependencies = [ "vortex-cuda", ] -[[package]] -name = "vortex-test-e2e-cuda-scan" -version = "0.1.0" -dependencies = [ - "futures", - "tokio", - "tracing", - "tracing-subscriber", - "vortex", - "vortex-cuda", -] - [[package]] name = "vortex-tui" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 7ab1c634acf..387a1ec7733 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "vortex-duckdb", "vortex-cuda", "vortex-cuda/cub", + "vortex-cuda/gpu-scan-cli", "vortex-cuda/macros", "vortex-cuda/nvcomp", "vortex-cxx", @@ -35,7 +36,6 @@ members = [ "vortex-tui", "vortex-test/e2e", "vortex-test/e2e-cuda", - "vortex-test/e2e-cuda-scan", "xtask", # Encodings "encodings/fastlanes", diff --git a/vortex-test/e2e-cuda-scan/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml similarity index 95% rename from vortex-test/e2e-cuda-scan/Cargo.toml rename to vortex-cuda/gpu-scan-cli/Cargo.toml index 
2e7d53e2f75..1ed2dd3e1a4 100644 --- a/vortex-test/e2e-cuda-scan/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "vortex-test-e2e-cuda-scan" +name = "gpu-scan-cli" authors = { workspace = true } description = "CUDA scan testing" edition = { workspace = true } diff --git a/vortex-cuda/gpu-scan-cli/README.md b/vortex-cuda/gpu-scan-cli/README.md new file mode 100644 index 00000000000..90210dc3ed8 --- /dev/null +++ b/vortex-cuda/gpu-scan-cli/README.md @@ -0,0 +1,19 @@ +# gpu-scan-cli + +A CLI tool for benchmarking CUDA-accelerated scans of Vortex files. + +## What it does + +1. Reads a Vortex file from disk +2. Recompresses it using only GPU-compatible encodings +3. Executes a full scan on the GPU via CUDA +4. Outputs tracing information about kernel execution times + +## Usage + +```bash +FLAT_LAYOUT_INLINE_ARRAY_NODE=true RUST_LOG=vortex_cuda=trace,info \ + cargo run --release --bin gpu-scan-cli -- ./path/to/file.vortex +``` + +Use `--json` for JSON-formatted trace output. 
diff --git a/vortex-test/e2e-cuda-scan/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs similarity index 100% rename from vortex-test/e2e-cuda-scan/src/main.rs rename to vortex-cuda/gpu-scan-cli/src/main.rs From dedc91626c928f87d61f7a9111898c5e02686959 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 10:09:24 -0500 Subject: [PATCH 04/14] lockfiles Signed-off-by: Andrew Duffy --- vortex-btrblocks/public-api.lock | 2 ++ vortex/public-api.lock | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 30aa578343e..bc5af615759 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -222,6 +222,8 @@ impl vortex_btrblocks::BtrBlocksCompressorBuilder pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::empty() -> Self + pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_float(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_int(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self diff --git a/vortex/public-api.lock b/vortex/public-api.lock index 6a7a137a0be..cba20f93640 100644 --- a/vortex/public-api.lock +++ b/vortex/public-api.lock @@ -18,6 +18,12 @@ pub use vortex::compressor::BtrBlocksCompressor pub use vortex::compressor::BtrBlocksCompressorBuilder +pub use vortex::compressor::FloatCode + +pub use vortex::compressor::IntCode + +pub use vortex::compressor::StringCode + pub mod vortex::compute2 pub use vortex::compute2::<> From e18b2e3e0fd0f7d707ee757c92a7b91cb1c46522 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 11:49:37 -0500 Subject: [PATCH 05/14] remove unused dep Signed-off-by: Andrew Duffy --- Cargo.lock | 1 - vortex-cuda/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock 
index c5081a7a5c2..ee23dc3927e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10145,7 +10145,6 @@ dependencies = [ "arrow-schema", "async-trait", "bindgen", - "bytes", "codspeed-criterion-compat-walltime", "cudarc", "fastlanes", diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index bd4afb972f6..6f6e9f92549 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -28,7 +28,6 @@ arc-swap = { workspace = true } arrow-data = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true, features = ["ffi"] } async-trait = { workspace = true } -bytes = { workspace = true } cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } flatbuffers = { workspace = true } From 444bd7d28fb6a8cca7931f3afc2b3d3947ec46fd Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 15 Feb 2026 14:37:57 -0500 Subject: [PATCH 06/14] don't build CLI on windows Signed-off-by: Andrew Duffy --- vortex-cuda/gpu-scan-cli/Cargo.toml | 5 ++++- vortex-cuda/gpu-scan-cli/src/main.rs | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/vortex-cuda/gpu-scan-cli/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml index 1ed2dd3e1a4..feb1349f22a 100644 --- a/vortex-cuda/gpu-scan-cli/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -15,10 +15,13 @@ version = { workspace = true } [lints] workspace = true +[features] +cuda = ["dep:vortex-cuda"] + [dependencies] futures = { workspace = true, features = ["executor"] } tokio = { workspace = true, features = ["macros", "full"] } tracing = { workspace = true, features = ["std", "attributes"] } tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } vortex = { workspace = true } -vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"] } +vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"], optional = true } diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index 
b82af917193..8cbec8b8819 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -1,9 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +#![allow(unused_imports)] + use std::env::args; use std::path::Path; use std::path::PathBuf; +use std::process::exit; use std::sync::Arc; use futures::StreamExt; @@ -25,12 +28,24 @@ use vortex::file::OpenOptionsSessionExt; use vortex::file::WriteOptionsSessionExt; use vortex::file::WriteStrategyBuilder; use vortex::session::VortexSession; +#[cfg(feature = "cuda")] use vortex_cuda::CopyDeviceReadAt; +#[cfg(feature = "cuda")] use vortex_cuda::CudaSession; +#[cfg(feature = "cuda")] use vortex_cuda::TracingLaunchStrategy; +#[cfg(feature = "cuda")] use vortex_cuda::VortexCudaStreamPool; +#[cfg(feature = "cuda")] use vortex_cuda::executor::CudaArrayExt; +#[cfg(not(feature = "cuda"))] +pub fn main() { + eprintln!("this CLI requires being built with the `cuda` feature enabled"); + exit(1); +} + +#[cfg(feature = "cuda")] #[tokio::main] pub async fn main() -> VortexResult<()> { let args: Vec = args().collect(); @@ -120,6 +135,7 @@ pub async fn main() -> VortexResult<()> { /// Recompress the input file using only GPU-executable encodings, returning the file as an /// in-memory byte array. 
+#[cfg(feature = "cuda")] async fn recompress_for_gpu( input_path: impl AsRef, session: &VortexSession, From 10a685f6503ea981dcc03199d387c2c731975715 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 15 Feb 2026 14:52:23 -0500 Subject: [PATCH 07/14] lint Signed-off-by: Andrew Duffy --- vortex-cuda/gpu-scan-cli/Cargo.toml | 5 ++++- vortex-cuda/gpu-scan-cli/src/main.rs | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vortex-cuda/gpu-scan-cli/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml index feb1349f22a..44f5db7aa0e 100644 --- a/vortex-cuda/gpu-scan-cli/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -24,4 +24,7 @@ tokio = { workspace = true, features = ["macros", "full"] } tracing = { workspace = true, features = ["std", "attributes"] } tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } vortex = { workspace = true } -vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"], optional = true } +vortex-cuda = { workspace = true, features = [ + "_test-harness", + "tracing", +], optional = true } diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index 8cbec8b8819..f878d87eb72 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -6,7 +6,6 @@ use std::env::args; use std::path::Path; use std::path::PathBuf; -use std::process::exit; use std::sync::Arc; use futures::StreamExt; @@ -40,14 +39,15 @@ use vortex_cuda::VortexCudaStreamPool; use vortex_cuda::executor::CudaArrayExt; #[cfg(not(feature = "cuda"))] -pub fn main() { +#[allow(clippy::exit)] +fn main() { eprintln!("this CLI requires being built with the `cuda` feature enabled"); - exit(1); + std::process::exit(1); } #[cfg(feature = "cuda")] #[tokio::main] -pub async fn main() -> VortexResult<()> { +async fn main() -> VortexResult<()> { let args: Vec = args().collect(); let json_output = args.iter().any(|arg| arg == "--json"); From 
e619d0e05989813d353e8b275e26b62a66965241 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 15 Feb 2026 15:10:11 -0500 Subject: [PATCH 08/14] skip Signed-off-by: Andrew Duffy --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 284af908f92..72391d51dbc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -627,7 +627,7 @@ jobs: --exclude vortex-cub --exclude vortex-test-e2e-cuda --exclude duckdb-bench ` --exclude lance-bench --exclude datafusion-bench --exclude random-access-bench ` --exclude compress-bench --exclude xtask --exclude vortex-datafusion ` - --exclude vortex-sqllogictest + --exclude gpu-scan-cli --exclude vortex-sqllogictest - name: Rust Tests (Other) if: matrix.os != 'windows-x64' run: | From 24b3a63d0b773237376dc7b23abd5f4f315d9779 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 16 Feb 2026 13:08:01 -0500 Subject: [PATCH 09/14] address comments Signed-off-by: Andrew Duffy --- Cargo.lock | 1 + vortex-cuda/gpu-scan-cli/Cargo.toml | 6 ++-- vortex-cuda/gpu-scan-cli/src/main.rs | 46 +++++------------------- vortex-cuda/src/kernel/encodings/zstd.rs | 8 +++-- 4 files changed, 17 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ee23dc3927e..ea88c2c94a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4128,6 +4128,7 @@ dependencies = [ "tracing-subscriber", "vortex", "vortex-cuda", + "vortex-cuda-macros", ] [[package]] diff --git a/vortex-cuda/gpu-scan-cli/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml index 44f5db7aa0e..59918216ea0 100644 --- a/vortex-cuda/gpu-scan-cli/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -15,9 +15,6 @@ version = { workspace = true } [lints] workspace = true -[features] -cuda = ["dep:vortex-cuda"] - [dependencies] futures = { workspace = true, features = ["executor"] } tokio = { workspace = true, features = ["macros", "full"] } @@ -27,4 +24,5 @@ vortex = { workspace = true 
} vortex-cuda = { workspace = true, features = [ "_test-harness", "tracing", -], optional = true } +] } +vortex-cuda-macros = { workspace = true } diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index f878d87eb72..a5755b31856 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -3,49 +3,19 @@ #![allow(unused_imports)] -use std::env::args; -use std::path::Path; -use std::path::PathBuf; -use std::sync::Arc; - -use futures::StreamExt; -use tracing::Instrument; -use tracing_subscriber::EnvFilter; -use tracing_subscriber::fmt::format::FmtSpan; -use vortex::VortexSessionDefault; -use vortex::array::ToCanonical; -use vortex::array::arrays::DictVTable; -use vortex::buffer::ByteBuffer; -use vortex::buffer::ByteBufferMut; -use vortex::compressor::BtrBlocksCompressorBuilder; -use vortex::compressor::FloatCode; -use vortex::compressor::IntCode; -use vortex::compressor::StringCode; -use vortex::error::VortexResult; -use vortex::file::Footer; -use vortex::file::OpenOptionsSessionExt; -use vortex::file::WriteOptionsSessionExt; -use vortex::file::WriteStrategyBuilder; -use vortex::session::VortexSession; -#[cfg(feature = "cuda")] +#[cuda_available] use vortex_cuda::CopyDeviceReadAt; -#[cfg(feature = "cuda")] +#[cuda_available] use vortex_cuda::CudaSession; -#[cfg(feature = "cuda")] +#[cuda_available] use vortex_cuda::TracingLaunchStrategy; -#[cfg(feature = "cuda")] +#[cuda_available] use vortex_cuda::VortexCudaStreamPool; -#[cfg(feature = "cuda")] +#[cuda_available] use vortex_cuda::executor::CudaArrayExt; +use vortex_cuda_macros::cuda_available; -#[cfg(not(feature = "cuda"))] -#[allow(clippy::exit)] -fn main() { - eprintln!("this CLI requires being built with the `cuda` feature enabled"); - std::process::exit(1); -} - -#[cfg(feature = "cuda")] +#[cuda_available] #[tokio::main] async fn main() -> VortexResult<()> { let args: Vec = args().collect(); @@ -135,7 +105,7 @@ async fn main() -> 
VortexResult<()> { /// Recompress the input file using only GPU-executable encodings, returning the file as an /// in-memory byte array. -#[cfg(feature = "cuda")] +#[cuda_available] async fn recompress_for_gpu( input_path: impl AsRef, session: &VortexSession, diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index c9980415c62..01cece1d39b 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -261,7 +261,9 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu #[cfg(feature = "tracing")] { let before = stream - .record_event(Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT)) + .record_event(Some( + cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC, + )) .map_err(|e| vortex_err!("recording event: {e}"))?; unsafe { nvcomp_zstd::decompress_async( @@ -280,7 +282,9 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu } let after = stream - .record_event(Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT)) + .record_event(Some( + cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC, + )) .map_err(|e| vortex_err!("recording event: {e}"))?; // measure timing. 
note: this forces a sync From a6a3917dd71b6f9b397bb848700fd0ba3a07948b Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 16 Feb 2026 13:38:23 -0500 Subject: [PATCH 10/14] address harder Signed-off-by: Andrew Duffy --- vortex-cuda/Cargo.toml | 1 - vortex-cuda/gpu-scan-cli/Cargo.toml | 5 +- vortex-cuda/gpu-scan-cli/src/main.rs | 162 ++++++++++-------- vortex-cuda/src/executor.rs | 14 +- vortex-cuda/src/kernel/arrays/constant.rs | 6 +- vortex-cuda/src/kernel/arrays/dict.rs | 6 +- vortex-cuda/src/kernel/arrays/shared.rs | 6 +- vortex-cuda/src/kernel/encodings/alp.rs | 6 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 6 +- .../src/kernel/encodings/date_time_parts.rs | 6 +- .../kernel/encodings/decimal_byte_parts.rs | 6 +- vortex-cuda/src/kernel/encodings/for_.rs | 6 +- vortex-cuda/src/kernel/encodings/runend.rs | 6 +- vortex-cuda/src/kernel/encodings/sequence.rs | 6 +- vortex-cuda/src/kernel/encodings/zigzag.rs | 6 +- vortex-cuda/src/kernel/encodings/zstd.rs | 60 +------ vortex-cuda/src/kernel/filter/mod.rs | 6 +- vortex-cuda/src/kernel/mod.rs | 41 ++++- vortex-cuda/src/kernel/slice/mod.rs | 6 +- vortex-cuda/src/lib.rs | 3 +- vortex-cuda/src/macros.rs | 42 ----- vortex-cuda/src/stream.rs | 4 +- 22 files changed, 174 insertions(+), 236 deletions(-) delete mode 100644 vortex-cuda/src/macros.rs diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index 6f6e9f92549..5d042de0a56 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -19,7 +19,6 @@ workspace = true [features] default = [] -tracing = ["dep:tracing"] _test-harness = [] unstable_encodings = ["vortex-zstd/unstable_encodings"] diff --git a/vortex-cuda/gpu-scan-cli/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml index 59918216ea0..0acd85438da 100644 --- a/vortex-cuda/gpu-scan-cli/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -21,8 +21,5 @@ tokio = { workspace = true, features = ["macros", "full"] } tracing = { workspace = true, features = ["std", "attributes"] } 
tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } vortex = { workspace = true } -vortex-cuda = { workspace = true, features = [ - "_test-harness", - "tracing", -] } +vortex-cuda = { workspace = true, features = ["_test-harness"] } vortex-cuda-macros = { workspace = true } diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index a5755b31856..3a75a4c0d38 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -3,99 +3,111 @@ #![allow(unused_imports)] +use std::env::args; +use std::path::PathBuf; +use std::sync::Arc; + +use futures::StreamExt; +use tracing::Instrument; +use tracing_subscriber::EnvFilter; +use tracing_subscriber::fmt::format::FmtSpan; +use vortex::VortexSessionDefault; +use vortex::array::ToCanonical; +use vortex::array::arrays::DictVTable; +use vortex::error::VortexResult; +use vortex::file::OpenOptionsSessionExt; +use vortex::session::VortexSession; #[cuda_available] use vortex_cuda::CopyDeviceReadAt; -#[cuda_available] use vortex_cuda::CudaSession; -#[cuda_available] use vortex_cuda::TracingLaunchStrategy; -#[cuda_available] use vortex_cuda::VortexCudaStreamPool; -#[cuda_available] use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; -#[cuda_available] #[tokio::main] async fn main() -> VortexResult<()> { - let args: Vec = args().collect(); - let json_output = args.iter().any(|arg| arg == "--json"); - - if json_output { - tracing_subscriber::fmt() - .json() - .with_env_filter(EnvFilter::from_default_env()) - .with_span_events(FmtSpan::NONE) - .with_ansi(false) - .init(); - } else { - tracing_subscriber::fmt() - .pretty() - .with_env_filter(EnvFilter::from_default_env()) - .with_span_events(FmtSpan::NONE) - .with_ansi(false) - .event_format(tracing_subscriber::fmt::format().with_target(true)) - .init(); - } - - let session = VortexSession::default(); - let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? 
- .with_launch_strategy(Arc::new(TracingLaunchStrategy)); - - #[allow(clippy::expect_used, clippy::unwrap_in_result)] - let input_path = args - .iter() - .skip(1) - .find(|arg| !arg.starts_with("--")) - .expect("must provide path to .vortex file"); - let input_path = PathBuf::from(input_path); - - assert!(input_path.exists(), "input path does not exist"); - - let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; - - // Create a full scan that executes on the GPU - let cuda_stream = - VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; - let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); - - let gpu_file = session - .open_options() - .with_footer(footer) - .open(Arc::new(gpu_reader)) - .await?; - - // execute_micros => µs to execute - let mut batches = gpu_file.scan()?.into_array_stream()?; + #[cuda_available] + { + let args: Vec = args().collect(); + let json_output = args.iter().any(|arg| arg == "--json"); + + if json_output { + tracing_subscriber::fmt() + .json() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .init(); + } else { + tracing_subscriber::fmt() + .pretty() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .event_format(tracing_subscriber::fmt::format().with_target(true)) + .init(); + } - let mut chunk = 0; - while let Some(next) = batches.next().await.transpose()? { - let record = next.to_struct(); + let session = VortexSession::default(); + let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? + .with_launch_strategy(Arc::new(TracingLaunchStrategy)); - for (field, field_name) in record - .unmasked_fields() + #[allow(clippy::expect_used, clippy::unwrap_in_result)] + let input_path = args .iter() - .zip(record.struct_fields().names().iter()) - { - let field_name = field_name.to_string(); - // skip dict, varbin isn't properly implemented. 
- if field.is::() { - continue; - } + .skip(1) + .find(|arg| !arg.starts_with("--")) + .expect("must provide path to .vortex file"); + let input_path = PathBuf::from(input_path); + + assert!(input_path.exists(), "input path does not exist"); + + let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; + + // Create a full scan that executes on the GPU + let cuda_stream = + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); + + let gpu_file = session + .open_options() + .with_footer(footer) + .open(Arc::new(gpu_reader)) + .await?; + + // execute_micros => µs to execute + let mut batches = gpu_file.scan()?.into_array_stream()?; + + let mut chunk = 0; + while let Some(next) = batches.next().await.transpose()? { + let record = next.to_struct(); + + for (field, field_name) in record + .unmasked_fields() + .iter() + .zip(record.struct_fields().names().iter()) + { + let field_name = field_name.to_string(); + // skip dict, varbin isn't properly implemented. 
+ if field.is::() { + continue; + } - let span = - tracing::info_span!("array execution", chunk = chunk, field_name = field_name); + let span = + tracing::info_span!("array execution", chunk = chunk, field_name = field_name); - async { - if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { - tracing::error!("failed to execute_cuda on column"); + async { + if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { + tracing::error!("failed to execute_cuda on column"); + } } + .instrument(span) + .await; } - .instrument(span) - .await; - } - chunk += 1; + chunk += 1; + } } Ok(()) diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 2afa5bd048d..a90f32a6a2c 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -14,6 +14,8 @@ use cudarc::driver::DeviceRepr; use cudarc::driver::LaunchArgs; use cudarc::driver::LaunchConfig; use futures::future::BoxFuture; +use tracing::debug; +use tracing::trace; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -29,14 +31,13 @@ use vortex_error::vortex_err; use crate::CudaSession; use crate::ExportDeviceArray; -use crate::debug; use crate::kernel::DefaultLaunchStrategy; use crate::kernel::LaunchStrategy; +use crate::kernel::LaunchStrategyExt; use crate::kernel::launch_cuda_kernel_impl; use crate::kernel::launch_cuda_kernel_with_config; use crate::session::CudaSessionExt; use crate::stream::VortexCudaStream; -use crate::trace; /// CUDA kernel events recorded before and after kernel launch. #[derive(Debug)] @@ -88,6 +89,15 @@ impl CudaExecutionCtx { self } + /// Perform an external kernel launch, with events created and logged via the configured + /// [`LaunchStrategy`]. + /// + /// We use CUB and NVCOMP routines, and those don't match the normal `cudarc` entrypoints, so + /// to inject the configured launch strategy we need to bracket it ourselves.
+ pub fn launch_external(&self, len: usize, function: F) -> VortexResult<()> { + self.strategy.with_strategy(&self.stream.0, len, function) + } + /// Launch a Kernel function with args setup done by the provided `build_args` closure. /// /// Kernels launched this way will use the default launch configuration, which provides no diff --git a/vortex-cuda/src/kernel/arrays/constant.rs b/vortex-cuda/src/kernel/arrays/constant.rs index 8f0451d31e6..9c6a43525d6 100644 --- a/vortex-cuda/src/kernel/arrays/constant.rs +++ b/vortex-cuda/src/kernel/arrays/constant.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::ConstantArray; @@ -46,10 +47,7 @@ impl ConstantNumericExecutor { #[async_trait] impl CudaExecute for ConstantNumericExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index 29a72d9cf44..2d13c0de216 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::DecimalArray; @@ -41,10 +42,7 @@ pub(crate) struct DictExecutor; #[async_trait] impl CudaExecute for DictExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/arrays/shared.rs 
b/vortex-cuda/src/kernel/arrays/shared.rs index cae0dc68988..914f5edbd32 100644 --- a/vortex-cuda/src/kernel/arrays/shared.rs +++ b/vortex-cuda/src/kernel/arrays/shared.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use async_trait::async_trait; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::SharedVTable; @@ -18,10 +19,7 @@ pub(crate) struct SharedExecutor; #[async_trait] impl CudaExecute for SharedExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index ff95fd9ed41..65192f7445d 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_alp::ALPArray; use vortex_alp::ALPFloat; use vortex_alp::ALPVTable; @@ -37,10 +38,7 @@ pub(crate) struct ALPExecutor; #[async_trait] impl CudaExecute for ALPExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 9f3f8641fa3..775e9ea665b 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -9,6 +9,7 @@ use cudarc::driver::CudaFunction; use cudarc::driver::DeviceRepr; use cudarc::driver::LaunchConfig; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use 
vortex_array::arrays::PrimitiveArray; @@ -45,10 +46,7 @@ impl BitPackedExecutor { #[async_trait] impl CudaExecute for BitPackedExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/date_time_parts.rs b/vortex-cuda/src/kernel/encodings/date_time_parts.rs index ab5d1da1785..ce0589eda76 100644 --- a/vortex-cuda/src/kernel/encodings/date_time_parts.rs +++ b/vortex-cuda/src/kernel/encodings/date_time_parts.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -43,10 +44,7 @@ pub(crate) struct DateTimePartsExecutor; #[async_trait] impl CudaExecute for DateTimePartsExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs index 042de8802d2..9a50d30aec7 100644 --- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs +++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs @@ -4,6 +4,7 @@ use std::fmt::Debug; use async_trait::async_trait; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::DecimalArray; @@ -24,10 +25,7 @@ pub(crate) struct DecimalBytePartsExecutor; #[async_trait] impl CudaExecute for DecimalBytePartsExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn 
execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index d5d8aa7ce48..30001941863 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -6,6 +6,7 @@ use std::fmt::Debug; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -38,10 +39,7 @@ impl FoRExecutor { #[async_trait] impl CudaExecute for FoRExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs index 9da8b636cd1..aed2fb226a7 100644 --- a/vortex-cuda/src/kernel/encodings/runend.rs +++ b/vortex-cuda/src/kernel/encodings/runend.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::ConstantArray; @@ -45,10 +46,7 @@ impl RunEndExecutor { #[async_trait] impl CudaExecute for RunEndExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs index 354e26b0629..0556aea2a49 100644 --- a/vortex-cuda/src/kernel/encodings/sequence.rs +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use 
tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -29,10 +30,7 @@ pub(crate) struct SequenceExecutor; #[async_trait] impl CudaExecute for SequenceExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index afbbc0458e5..393a87eed43 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -6,6 +6,7 @@ use std::fmt::Debug; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -37,10 +38,7 @@ impl ZigZagExecutor { #[async_trait] impl CudaExecute for ZigZagExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index 01cece1d39b..e6efdf47c58 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -11,6 +11,9 @@ use cudarc::driver::CudaSlice; use cudarc::driver::DevicePtr; use cudarc::driver::DevicePtrMut; use futures::future::try_join_all; +use tracing::debug; +use tracing::instrument; +use tracing::trace; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::BinaryView; @@ -35,7 +38,6 @@ use vortex_zstd::ZstdVTable; use crate::CudaBufferExt; use crate::CudaDeviceBuffer; -use crate::debug; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; @@ -197,10 
+199,7 @@ impl ZstdExecutor { #[async_trait] impl CudaExecute for ZstdExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, @@ -255,54 +254,9 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu let stream = ctx.stream(); - // NOTE(aduffy): we need to use the explicit tracing/not(tracing) blocks here because we go - // through nvcomp instead of delegating through the LaunchBuilder. - // We should find a way to bridge the two. - #[cfg(feature = "tracing")] - { - let before = stream - .record_event(Some( - cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC, - )) - .map_err(|e| vortex_err!("recording event: {e}"))?; + ctx.launch_external(n_rows, || + // SAFETY: zstd_kernel_prepare makes sure to return valid kernel params. unsafe { - nvcomp_zstd::decompress_async( - exec.frame_ptrs_ptr as _, - exec.frame_sizes_ptr as _, - exec.output_sizes_ptr as _, - exec.device_actual_sizes.device_ptr_mut(stream).0 as _, - exec.num_frames, - exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _, - exec.nvcomp_temp_buffer_size, - exec.output_ptrs_ptr as _, - exec.device_statuses.device_ptr_mut(stream).0 as _, - stream.cu_stream().cast(), - ) - .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?; - } - - let after = stream - .record_event(Some( - cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC, - )) - .map_err(|e| vortex_err!("recording event: {e}"))?; - - // measure timing. 
note: this forces a sync - let duration = crate::CudaKernelEvents { - before_launch: before, - after_launch: after, - } - .duration()?; - - crate::trace!( - execution_nanos = duration.as_nanos(), - len = n_rows, - "ZSTD execution" - ); - } - - #[cfg(not(feature = "tracing"))] - unsafe { nvcomp_zstd::decompress_async( exec.frame_ptrs_ptr as _, exec.frame_sizes_ptr as _, @@ -316,7 +270,7 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu stream.cu_stream().cast(), ) .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?; - } + })?; // Unconditionally copy back to the host as Zstd arrays are fully // self-contained. They neither have any parent or child encodings. diff --git a/vortex-cuda/src/kernel/filter/mod.rs b/vortex-cuda/src/kernel/filter/mod.rs index 2e76a659b9b..ff080d52aae 100644 --- a/vortex-cuda/src/kernel/filter/mod.rs +++ b/vortex-cuda/src/kernel/filter/mod.rs @@ -15,6 +15,7 @@ use async_trait::async_trait; use cudarc::driver::DevicePtr; use cudarc::driver::DevicePtrMut; use cudarc::driver::DeviceRepr; +use tracing::instrument; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::FilterArrayParts; @@ -42,10 +43,7 @@ pub struct FilterExecutor; #[async_trait] impl CudaExecute for FilterExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 0bca57a15e1..8b1633f1d3d 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -11,10 +11,12 @@ use std::sync::Arc; use cudarc::driver::CudaContext; use cudarc::driver::CudaFunction; use cudarc::driver::CudaModule; +use cudarc::driver::CudaStream; use cudarc::driver::LaunchArgs; use cudarc::driver::LaunchConfig; use cudarc::driver::sys::CUevent_flags; use 
cudarc::nvrtc::Ptx; +use tracing::trace; use vortex_cuda_macros::cuda_tests; use vortex_error::VortexResult; use vortex_error::vortex_err; @@ -36,8 +38,6 @@ pub(crate) use filter::FilterExecutor; pub(crate) use slice::SliceExecutor; use crate::CudaKernelEvents; -#[cfg(feature = "tracing")] -use crate::trace; /// Trait for customizing kernel launch behavior. /// @@ -51,6 +51,41 @@ pub trait LaunchStrategy: Debug + Send + Sync + 'static { fn on_complete(&self, events: &CudaKernelEvents, len: usize) -> VortexResult<()>; } +pub trait LaunchStrategyExt: LaunchStrategy { + fn with_strategy(&self, stream: &CudaStream, len: usize, func: F) -> VortexResult + where + F: FnMut() -> R; +} + +impl LaunchStrategyExt for S { + fn with_strategy(&self, stream: &CudaStream, len: usize, func: F) -> VortexResult + where + F: FnMut() -> R, + { + let flags = self.event_flags(); + + let before = stream + .record_event(Some(flags)) + .map_err(|e| vortex_err!("record_event: {e}"))?; + + let result = func(); + + let after = stream + .record_event(Some(flags)) + .map_err(|e| vortex_err!("record_event: {e}"))?; + + self.on_complete( + &CudaKernelEvents { + before_launch: before, + after_launch: after, + }, + len, + )?; + + Ok(result) + } +} + /// Default launch strategy with no tracing overhead. #[derive(Debug)] pub struct DefaultLaunchStrategy; @@ -66,11 +101,9 @@ impl LaunchStrategy for DefaultLaunchStrategy { } /// Launch strategy that records timing and emits trace events. 
-#[cfg(feature = "tracing")] #[derive(Debug)] pub struct TracingLaunchStrategy; -#[cfg(feature = "tracing")] impl LaunchStrategy for TracingLaunchStrategy { fn event_flags(&self) -> CUevent_flags { CUevent_flags::CU_EVENT_DEFAULT diff --git a/vortex-cuda/src/kernel/slice/mod.rs b/vortex-cuda/src/kernel/slice/mod.rs index 13922bba805..74f95bc525c 100644 --- a/vortex-cuda/src/kernel/slice/mod.rs +++ b/vortex-cuda/src/kernel/slice/mod.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use async_trait::async_trait; +use tracing::instrument; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -19,10 +20,7 @@ pub struct SliceExecutor; #[async_trait] impl CudaExecute for SliceExecutor { - #[cfg_attr( - feature = "tracing", - tracing::instrument(level = "trace", skip_all, fields(self)) - )] + #[instrument(level = "trace", skip_all, fields(executor = ?self))] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index 9b5b9c3bd6f..ebdf225c36a 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -5,6 +5,8 @@ use std::process::Command; +use tracing::info; + pub mod arrow; mod canonical; mod device_buffer; @@ -36,7 +38,6 @@ use kernel::FoRExecutor; pub use kernel::LaunchStrategy; use kernel::RunEndExecutor; use kernel::SharedExecutor; -#[cfg(feature = "tracing")] pub use kernel::TracingLaunchStrategy; use kernel::ZigZagExecutor; #[cfg(feature = "unstable_encodings")] diff --git a/vortex-cuda/src/macros.rs b/vortex-cuda/src/macros.rs deleted file mode 100644 index e537995cd3e..00000000000 --- a/vortex-cuda/src/macros.rs +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -#[macro_export] -macro_rules! warn { - ($($tts:tt)*) => { - #[cfg(feature = "tracing")] - { - tracing::warn!($($tts)*); - } - }; -} - -#[macro_export] -macro_rules! 
info { - ($($tts:tt)*) => { - #[cfg(feature = "tracing")] - { - tracing::info!($($tts)*); - } - }; -} - -#[macro_export] -macro_rules! debug { - ($($tts:tt)*) => { - #[cfg(feature = "tracing")] - { - tracing::info!($($tts)*); - } - }; -} - -#[macro_export] -macro_rules! trace { - ($($tts:tt)*) => { - #[cfg(feature = "tracing")] - { - tracing::info!($($tts)*); - } - }; -} diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs index 449cd6db072..305408c4f6f 100644 --- a/vortex-cuda/src/stream.rs +++ b/vortex-cuda/src/stream.rs @@ -14,12 +14,12 @@ use cudarc::driver::result::memcpy_htod_async; use cudarc::driver::result::stream; use futures::future::BoxFuture; use kanal::Sender; +use tracing::warn; use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; use vortex_error::vortex_err; use crate::CudaDeviceBuffer; -use crate::warn; #[derive(Clone)] pub struct VortexCudaStream(pub Arc); @@ -156,7 +156,7 @@ fn register_stream_callback(stream: &CudaStream) -> VortexResult) }; // Blocking send as we're in a callback invoked by the CUDA driver. - // NOTE: send can fail if the CudaEvent is dropped by the caller, in which case the reeciver + // NOTE: send can fail if the CudaEvent is dropped by the caller, in which case the receiver // is closed and sends will fail. 
if let Err(_e) = tx.send(()) { warn!(error = ?_e, "register_stream_callback send failed due to error"); From ce86532bfdcfd27e144c14d4b7299faefad91f07 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 16 Feb 2026 14:06:04 -0500 Subject: [PATCH 11/14] save Signed-off-by: Andrew Duffy --- vortex-cuda/src/executor.rs | 10 ++++++-- vortex-cuda/src/kernel/encodings/zstd.rs | 30 ++++++++++++------------ vortex-cuda/src/kernel/mod.rs | 16 +++++++------ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index a90f32a6a2c..9e1723dfbb7 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -94,8 +94,14 @@ impl CudaExecutionCtx { /// /// We use CUB and NVCOMP routines, and those don't match the normaal `cudarc` entrypoints, so /// to inject the configured launch strategy we need to bracket it ourselves. - pub fn launch_external(&self, len: usize, function: F) -> VortexResult<()> { - self.strategy.with_strategy(&self.stream.0, len, function) + pub fn launch_external VortexResult<()>>( + &self, + len: usize, + function: F, + ) -> VortexResult<()> { + self.strategy + .as_ref() + .with_strategy(&self.stream.0, len, function) } /// Launch a Kernel function with args setup done by the provided `build_args` closure. 
diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index e6efdf47c58..ed11bab81a3 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -13,7 +13,6 @@ use cudarc::driver::DevicePtrMut; use futures::future::try_join_all; use tracing::debug; use tracing::instrument; -use tracing::trace; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::BinaryView; @@ -254,22 +253,23 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu let stream = ctx.stream(); - ctx.launch_external(n_rows, || + ctx.launch_external(n_rows, || { // SAFETY: zstd_kernel_prepare makes sure to return valid kernel params. unsafe { - nvcomp_zstd::decompress_async( - exec.frame_ptrs_ptr as _, - exec.frame_sizes_ptr as _, - exec.output_sizes_ptr as _, - exec.device_actual_sizes.device_ptr_mut(stream).0 as _, - exec.num_frames, - exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _, - exec.nvcomp_temp_buffer_size, - exec.output_ptrs_ptr as _, - exec.device_statuses.device_ptr_mut(stream).0 as _, - stream.cu_stream().cast(), - ) - .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?; + nvcomp_zstd::decompress_async( + exec.frame_ptrs_ptr as _, + exec.frame_sizes_ptr as _, + exec.output_sizes_ptr as _, + exec.device_actual_sizes.device_ptr_mut(stream).0 as _, + exec.num_frames, + exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _, + exec.nvcomp_temp_buffer_size, + exec.output_ptrs_ptr as _, + exec.device_statuses.device_ptr_mut(stream).0 as _, + stream.cu_stream().cast(), + ) + .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e)) + } })?; // Unconditionally copy back to the host as Zstd arrays are fully diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 8b1633f1d3d..73ad25578c1 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -51,16 +51,18 @@ pub trait 
LaunchStrategy: Debug + Send + Sync + 'static { fn on_complete(&self, events: &CudaKernelEvents, len: usize) -> VortexResult<()>; } +/// Extension trait for executing a function which may generate CUDA operations, bracketing them +/// with CUDA events created using the launch strategy system. pub trait LaunchStrategyExt: LaunchStrategy { - fn with_strategy(&self, stream: &CudaStream, len: usize, func: F) -> VortexResult + fn with_strategy(&self, stream: &CudaStream, len: usize, func: F) -> VortexResult<()> where - F: FnMut() -> R; + F: FnMut() -> VortexResult<()>; } -impl LaunchStrategyExt for S { - fn with_strategy(&self, stream: &CudaStream, len: usize, func: F) -> VortexResult +impl LaunchStrategyExt for S { + fn with_strategy(&self, stream: &CudaStream, len: usize, mut func: F) -> VortexResult<()> where - F: FnMut() -> R, + F: FnMut() -> VortexResult<()>, { let flags = self.event_flags(); @@ -68,7 +70,7 @@ impl LaunchStrategyExt for S { .record_event(Some(flags)) .map_err(|e| vortex_err!("record_event: {e}"))?; - let result = func(); + func()?; let after = stream .record_event(Some(flags)) @@ -82,7 +84,7 @@ impl LaunchStrategyExt for S { len, )?; - Ok(result) + Ok(()) } } From 87721c8ca3db349cb4e07d5eed4d10b37543ad93 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 16 Feb 2026 14:31:11 -0500 Subject: [PATCH 12/14] hardest Signed-off-by: Andrew Duffy --- vortex-cuda/gpu-scan-cli/src/main.rs | 149 ++++++++++++++------------- vortex-cuda/src/kernel/mod.rs | 2 +- 2 files changed, 76 insertions(+), 75 deletions(-) diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index 3a75a4c0d38..87bbbe69f46 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -17,97 +17,98 @@ use vortex::array::arrays::DictVTable; use vortex::error::VortexResult; use vortex::file::OpenOptionsSessionExt; use vortex::session::VortexSession; -#[cuda_available] use vortex_cuda::CopyDeviceReadAt; use 
vortex_cuda::CudaSession; use vortex_cuda::TracingLaunchStrategy; use vortex_cuda::VortexCudaStreamPool; use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; +use vortex_cuda_macros::cuda_not_available; + +#[cuda_not_available] +fn main() {} +#[cuda_available] #[tokio::main] async fn main() -> VortexResult<()> { - #[cuda_available] - { - let args: Vec = args().collect(); - let json_output = args.iter().any(|arg| arg == "--json"); - - if json_output { - tracing_subscriber::fmt() - .json() - .with_env_filter(EnvFilter::from_default_env()) - .with_span_events(FmtSpan::NONE) - .with_ansi(false) - .init(); - } else { - tracing_subscriber::fmt() - .pretty() - .with_env_filter(EnvFilter::from_default_env()) - .with_span_events(FmtSpan::NONE) - .with_ansi(false) - .event_format(tracing_subscriber::fmt::format().with_target(true)) - .init(); - } + let args: Vec = args().collect(); + let json_output = args.iter().any(|arg| arg == "--json"); + + if json_output { + tracing_subscriber::fmt() + .json() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .init(); + } else { + tracing_subscriber::fmt() + .pretty() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .event_format(tracing_subscriber::fmt::format().with_target(true)) + .init(); + } + + let session = VortexSession::default(); + let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? 
+ .with_launch_strategy(Arc::new(TracingLaunchStrategy)); + + #[allow(clippy::expect_used, clippy::unwrap_in_result)] + let input_path = args + .iter() + .skip(1) + .find(|arg| !arg.starts_with("--")) + .expect("must provide path to .vortex file"); + let input_path = PathBuf::from(input_path); + + assert!(input_path.exists(), "input path does not exist"); + + let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; - let session = VortexSession::default(); - let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? - .with_launch_strategy(Arc::new(TracingLaunchStrategy)); + // Create a full scan that executes on the GPU + let cuda_stream = + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); - #[allow(clippy::expect_used, clippy::unwrap_in_result)] - let input_path = args + let gpu_file = session + .open_options() + .with_footer(footer) + .open(Arc::new(gpu_reader)) + .await?; + + // execute_micros => µs to execute + let mut batches = gpu_file.scan()?.into_array_stream()?; + + let mut chunk = 0; + while let Some(next) = batches.next().await.transpose()? 
{ + let record = next.to_struct(); + + for (field, field_name) in record + .unmasked_fields() .iter() - .skip(1) - .find(|arg| !arg.starts_with("--")) - .expect("must provide path to .vortex file"); - let input_path = PathBuf::from(input_path); - - assert!(input_path.exists(), "input path does not exist"); - - let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; - - // Create a full scan that executes on the GPU - let cuda_stream = - VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; - let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); - - let gpu_file = session - .open_options() - .with_footer(footer) - .open(Arc::new(gpu_reader)) - .await?; - - // execute_micros => µs to execute - let mut batches = gpu_file.scan()?.into_array_stream()?; - - let mut chunk = 0; - while let Some(next) = batches.next().await.transpose()? { - let record = next.to_struct(); - - for (field, field_name) in record - .unmasked_fields() - .iter() - .zip(record.struct_fields().names().iter()) - { - let field_name = field_name.to_string(); - // skip dict, varbin isn't properly implemented. - if field.is::() { - continue; - } + .zip(record.struct_fields().names().iter()) + { + let field_name = field_name.to_string(); + // skip dict, varbin isn't properly implemented. 
+ if field.is::() { + continue; + } - let span = - tracing::info_span!("array execution", chunk = chunk, field_name = field_name); + let span = + tracing::info_span!("array execution", chunk = chunk, field_name = field_name); - async { - if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { - tracing::error!("failed to execute_cuda on column"); - } + async { + if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { + tracing::error!("failed to execute_cuda on column"); } - .instrument(span) - .await; } - - chunk += 1; + .instrument(span) + .await; } + + chunk += 1; } Ok(()) diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 73ad25578c1..02660052eb7 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -59,7 +59,7 @@ pub trait LaunchStrategyExt: LaunchStrategy { F: FnMut() -> VortexResult<()>; } -impl LaunchStrategyExt for S { +impl LaunchStrategyExt for S { fn with_strategy(&self, stream: &CudaStream, len: usize, mut func: F) -> VortexResult<()> where F: FnMut() -> VortexResult<()>, From 7c98499824b963002c0d0887803129d9dddb177a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 16 Feb 2026 16:23:03 -0500 Subject: [PATCH 13/14] bracket CUB filters Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/filter/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vortex-cuda/src/kernel/filter/mod.rs b/vortex-cuda/src/kernel/filter/mod.rs index ff080d52aae..962007eafad 100644 --- a/vortex-cuda/src/kernel/filter/mod.rs +++ b/vortex-cuda/src/kernel/filter/mod.rs @@ -136,7 +136,7 @@ async fn filter_sized Date: Tue, 17 Feb 2026 11:00:08 +0000 Subject: [PATCH 14/14] nits Signed-off-by: Alexander Droste --- vortex-cuda/benches/bitpacked_cuda.rs | 2 +- vortex-cuda/benches/common/mod.rs | 8 +------- vortex-cuda/benches/date_time_parts_cuda.rs | 2 +- vortex-cuda/benches/dict_cuda.rs | 2 +- vortex-cuda/benches/for_cuda.rs | 4 ++-- vortex-cuda/benches/runend_cuda.rs | 2 +- 
vortex-cuda/src/executor.rs | 2 +- vortex-cuda/src/kernel/arrays/dict.rs | 3 --- 8 files changed, 8 insertions(+), 17 deletions(-) diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs index fb859607e12..880ce673e6d 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -75,7 +75,7 @@ where |b, array| { b.iter_custom(|iters| { let timed = TimedLaunchStrategy::default(); - let timer = Arc::clone(timed.get()); + let timer = Arc::clone(&timed.total_time_ns); let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context") diff --git a/vortex-cuda/benches/common/mod.rs b/vortex-cuda/benches/common/mod.rs index 94273ae599d..578cd398c79 100644 --- a/vortex-cuda/benches/common/mod.rs +++ b/vortex-cuda/benches/common/mod.rs @@ -13,13 +13,7 @@ use vortex_error::VortexResult; #[derive(Debug, Default)] pub struct TimedLaunchStrategy { - total_time_ns: Arc, -} - -impl TimedLaunchStrategy { - pub fn get(&self) -> &Arc { - &self.total_time_ns - } + pub total_time_ns: Arc, } impl LaunchStrategy for TimedLaunchStrategy { diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index f378630cd8b..2f01fd8f3b8 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -68,7 +68,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) { |b, dtp_array| { b.iter_custom(|iters| { let timed = TimedLaunchStrategy::default(); - let timer = Arc::clone(timed.get()); + let timer = Arc::clone(&timed.total_time_ns); let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context") diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index 5ef61bc7d6c..f71867dcd5b 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -93,7 +93,7 @@ where 
|b, dict_array| { b.iter_custom(|iters| { let timed = TimedLaunchStrategy::default(); - let timer = Arc::clone(timed.get()); + let timer = Arc::clone(&timed.total_time_ns); let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context") diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index ce45b3d6041..6de94eae8c7 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -85,7 +85,7 @@ where |b, for_array| { b.iter_custom(|iters| { let timed = TimedLaunchStrategy::default(); - let timer = Arc::clone(timed.get()); + let timer = Arc::clone(&timed.total_time_ns); let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context") @@ -123,7 +123,7 @@ where |b, for_array| { b.iter_custom(|iters| { let timed = TimedLaunchStrategy::default(); - let timer = Arc::clone(timed.get()); + let timer = Arc::clone(&timed.total_time_ns); let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) .vortex_expect("failed to create execution context") diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index 7e5f4e5906d..633d44a3d88 100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -81,7 +81,7 @@ where |b, runend_array| { b.iter_custom(|iters| { let timed = TimedLaunchStrategy::default(); - let timer = Arc::clone(timed.get()); + let timer = Arc::clone(&timed.total_time_ns); let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 9e1723dfbb7..42475945fc3 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -92,7 +92,7 @@ impl CudaExecutionCtx { /// Perform an external kernel launch, with events created and logged via the configured /// [`LaunchStrategy`]. 
/// - /// We use CUB and NVCOMP routines, and those don't match the normaal `cudarc` entrypoints, so + /// We use CUB and NVCOMP routines, and those don't match the normal `cudarc` entrypoints, so /// to inject the configured launch strategy we need to bracket it ourselves. pub fn launch_external VortexResult<()>>( &self, diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index 2d13c0de216..26ecb17efcc 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -178,9 +178,6 @@ async fn execute_dict_decimal_typed< assert!(!codes.is_empty()); let codes_len = codes.len(); let codes_len_u64 = codes_len as u64; - if codes_len == 0 { - vortex_bail!("Cannot execute dict on empty codes array"); - } let DecimalArrayParts { values: values_buffer,