Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ members = [
"vortex-tui",
"vortex-test/e2e",
"vortex-test/e2e-cuda",
"vortex-test/e2e-cuda-scan",
"xtask",
# Encodings
"encodings/fastlanes",
Expand Down
2 changes: 1 addition & 1 deletion _typos.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[default]
extend-ignore-identifiers-re = ["FoR", "typ"]
extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ"]
# We support a few common special comments to tell the checker to ignore sections of code
extend-ignore-re = [
"(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line
Expand Down
5 changes: 5 additions & 0 deletions vortex-array/src/arrays/primitive/vtable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ impl VTable for PrimitiveVTable {

let ptype = PType::try_from(dtype)?;

vortex_ensure!(
buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
"Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
);

if buffer.len() != ptype.byte_width() * len {
vortex_bail!(
"Buffer length {} does not match expected length {} for {}, {}",
Expand Down
9 changes: 9 additions & 0 deletions vortex-btrblocks/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
}

impl BtrBlocksCompressorBuilder {
/// Create a new builder with no encodings enabled.
///
/// Each scheme collection (`int_schemes`, `float_schemes`, `string_schemes`) is
/// initialized through `Default::default()`; nothing is carried over from the
/// builder's own `Default` implementation.
pub fn empty() -> Self {
    let int_schemes = Default::default();
    let float_schemes = Default::default();
    let string_schemes = Default::default();

    Self {
        int_schemes,
        float_schemes,
        string_schemes,
    }
}

/// Excludes the specified integer compression schemes.
pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
let codes: HashSet<_> = codes.into_iter().collect();
Expand Down
8 changes: 7 additions & 1 deletion vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ workspace = true

[features]
default = []
tracing = ["dep:tracing"]
_test-harness = []
unstable_encodings = ["vortex-zstd/unstable_encodings"]

Expand All @@ -26,12 +27,17 @@ arc-swap = { workspace = true }
arrow-data = { workspace = true, features = ["ffi"] }
arrow-schema = { workspace = true, features = ["ffi"] }
async-trait = { workspace = true }
bytes = { workspace = true }
cudarc = { workspace = true, features = ["f16"] }
fastlanes = { workspace = true }
futures = { workspace = true, features = ["executor"] }
kanal = { workspace = true }
paste = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true, features = ["fs"] }
tracing = { workspace = true, features = [
"std",
"attributes",
], optional = true }
vortex-alp = { workspace = true }
vortex-array = { workspace = true }
vortex-buffer = { workspace = true }
Expand Down
79 changes: 15 additions & 64 deletions vortex-cuda/benches/bitpacked_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,25 @@
#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

mod common;

use std::mem::size_of;
use std::ops::Add;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use cudarc::driver::DeviceRepr;
use cudarc::driver::PushKernelArg;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use futures::executor::block_on;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity::NonNullable;
use vortex_buffer::Buffer;
use vortex_cuda::CudaBufferExt;
use vortex_cuda::CudaDeviceBuffer;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::BitPackedExecutor;
use vortex_cuda::CudaSession;
use vortex_cuda::bitpacked_cuda_kernel;
use vortex_cuda::bitpacked_cuda_launch_config;
use vortex_cuda::launch_cuda_kernel_with_config;
use vortex_cuda::executor::CudaExecute;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_dtype::NativePType;
Expand All @@ -35,6 +33,8 @@ use vortex_fastlanes::BitPackedArray;
use vortex_fastlanes::unpack_iter::BitPacked;
use vortex_session::VortexSession;

use crate::common::TimedLaunchStrategy;

const N_ROWS: usize = 100_000_000;

/// Create a bit-packed array with the given bit width
Expand All @@ -56,54 +56,6 @@ where
.vortex_expect("failed to create BitPacked array")
}

/// Launch the bit unpacking kernel and return elapsed GPU time
///
/// The packed buffer is moved to the device first unless it is already there, and the
/// output buffer is allocated for `len` rounded up to the next multiple of 1024 elements.
/// Errors from moving the input, looking up the kernel, computing the launch config,
/// launching, or reading the event duration are propagated; allocation and view failures
/// panic via `vortex_expect`.
fn launch_bitunpack_kernel_timed_typed<T>(
    bitpacked_array: &BitPackedArray,
    cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration>
where
    T: BitPacked + DeviceRepr,
    T::Physical: DeviceRepr,
{
    let host_or_device = bitpacked_array.packed().clone();
    let bit_width = bitpacked_array.bit_width();
    let len = bitpacked_array.len();
    // Width of one unpacked element, in bits.
    let output_width = size_of::<T>() * 8;

    // Only pay for a host-to-device copy when the packed data is not already resident.
    let device_input = if !host_or_device.is_on_device() {
        block_on(cuda_ctx.move_to_device(host_or_device)?).vortex_expect("failed to move to device")
    } else {
        host_or_device
    };

    // Output buffer, wrapped so a typed device view can be taken from it.
    let output_buf = CudaDeviceBuffer::new(
        cuda_ctx
            .device_alloc::<T>(len.next_multiple_of(1024))
            .vortex_expect("failed to allocate output"),
    );

    // Typed device views that are handed to the kernel as its two arguments.
    let input_view = device_input
        .cuda_view::<T::Physical>()
        .vortex_expect("failed to get input view");
    let output_view = output_buf.as_view::<T>();

    let kernel = bitpacked_cuda_kernel(bit_width, output_width, cuda_ctx)?;
    let mut launch_builder = cuda_ctx.launch_builder(&kernel);
    launch_builder.arg(&input_view);
    launch_builder.arg(&output_view);

    let launch_config = bitpacked_cuda_launch_config(output_width, len)?;

    // Launch with blocking-sync events and report the time measured between them.
    let events = launch_cuda_kernel_with_config(
        &mut launch_builder,
        launch_config,
        CU_EVENT_BLOCKING_SYNC,
    )?;

    events.duration()
}

/// Generic benchmark function for a specific type and bit width
fn benchmark_bitunpack_typed<T>(c: &mut Criterion, bit_width: u8, type_name: &str)
where
Expand All @@ -123,19 +75,18 @@ where
&array,
|b, array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut total_time = Duration::ZERO;
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));

for _ in 0..iters {
let kernel_time =
launch_bitunpack_kernel_timed_typed::<T>(array, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
block_on(BitPackedExecutor.execute(array.to_array(), &mut cuda_ctx)).unwrap();
}

total_time
Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);
Expand Down
39 changes: 39 additions & 0 deletions vortex-cuda/benches/common/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;

use cudarc::driver::sys::CUevent_flags;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use vortex_cuda::CudaKernelEvents;
use vortex_cuda::LaunchStrategy;
use vortex_error::VortexResult;

/// Launch strategy for benchmarks that accumulates elapsed kernel time.
///
/// The running total is kept in nanoseconds behind an `Arc<AtomicU64>`, so a caller can
/// clone the handle before giving the strategy away and still read the total afterwards.
#[derive(Debug, Default)]
pub struct TimedLaunchStrategy {
    /// Sum of all recorded durations, in nanoseconds.
    total_time_ns: Arc<AtomicU64>,
}

impl TimedLaunchStrategy {
    /// Borrow the shared nanosecond counter.
    ///
    /// Use `Arc::clone` on the result to keep a handle that outlives this strategy.
    pub fn get(&self) -> &Arc<AtomicU64> {
        let Self { total_time_ns } = self;
        total_time_ns
    }
}

impl LaunchStrategy for TimedLaunchStrategy {
    /// Event flags requested for each launch: always `CU_EVENT_BLOCKING_SYNC`.
    fn event_flags(&self) -> CUevent_flags {
        // using blocking_sync to make sure all events flush before we complete.
        CU_EVENT_BLOCKING_SYNC
    }

    /// Add the duration reported by `events` to the running total.
    ///
    /// `_len` is ignored. An error from `events.duration()` is propagated and leaves the
    /// total untouched.
    fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> {
        let kernel_duration = events.duration()?;

        // NOTE: as long as the duration < 584 years this cast is safe.
        let nanos = kernel_duration.as_nanos() as u64;
        self.total_time_ns.fetch_add(nanos, Ordering::Relaxed);

        Ok(())
    }
}
Loading
Loading