@@ -14,9 +14,11 @@ DerivePointerAlignment: false
PointerAlignment: Right
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowAllArgumentsOnNextLine: false
AllowAllConstructorInitializersOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
AlignAfterOpenBracket: Align
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
@@ -32,4 +34,3 @@ IncludeBlocks: Regroup
Language: Cpp
AccessModifierOffset: -4
---
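
The reformatting in the rest of this PR follows from these settings: with AllowAllArgumentsOnNextLine, AllowAllConstructorInitializersOnNextLine, AllowAllParametersOfDeclarationOnNextLine, BinPackArguments, and BinPackParameters all set to false, and AlignAfterOpenBracket: Align, any signature or call that no longer fits on one line is broken with one argument per line, aligned under the opening parenthesis. A minimal illustration of the effect (example_fn is a placeholder, not code from the repository):

// Old settings: clang-format may pack several parameters per continuation line.
cudaError_t example_fn(
    void *d_temp, size_t temp_bytes, const uint32_t *d_in, const uint8_t *d_flags,
    uint32_t *d_out, int64_t *d_num_selected, int64_t num_items, cudaStream_t stream);

// New settings: one parameter per line, aligned to the opening parenthesis.
cudaError_t example_fn(void *d_temp,
                       size_t temp_bytes,
                       const uint32_t *d_in,
                       const uint8_t *d_flags,
                       uint32_t *d_out,
                       int64_t *d_num_selected,
                       int64_t num_items,
                       cudaStream_t stream);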

169 changes: 92 additions & 77 deletions vortex-cuda/cub/kernels/filter.cu
@@ -4,10 +4,10 @@
// CUB DeviceSelect::Flagged wrapper for Vortex GPU filtering.

#include <cub/cub.cuh>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

// i256 type
typedef struct {
@@ -17,11 +17,10 @@ typedef struct {

// Bit extraction functor for TransformInputIterator
struct BitExtractor {
const uint8_t* packed;
const uint8_t *packed;
uint64_t bit_offset;

__host__ __device__ inline
uint8_t operator()(int64_t idx) const {
__host__ __device__ inline uint8_t operator()(int64_t idx) const {
uint64_t actual_bit = bit_offset + static_cast<uint64_t>(idx);
uint64_t byte_idx = actual_bit / 8;
uint32_t bit_idx = actual_bit % 8;
@@ -30,31 +29,27 @@
};

/// Type alias for the packed bit iterator.
using PackedBitIterator = thrust::transform_iterator<
BitExtractor,
thrust::counting_iterator<int64_t>
>;
using PackedBitIterator = thrust::transform_iterator<BitExtractor, thrust::counting_iterator<int64_t>>;

// CUB DeviceSelect::Flagged - Query temp storage size
template<typename T>
static cudaError_t filter_temp_size_impl(size_t* temp_bytes, int64_t num_items) {
template <typename T>
static cudaError_t filter_temp_size_impl(size_t *temp_bytes, int64_t num_items) {
size_t bytes = 0;
cudaError_t err = cub::DeviceSelect::Flagged(
nullptr, bytes,
static_cast<const T*>(nullptr),
static_cast<const uint8_t*>(nullptr),
static_cast<T*>(nullptr),
static_cast<int64_t*>(nullptr),
num_items
);
cudaError_t err = cub::DeviceSelect::Flagged(nullptr,
bytes,
static_cast<const T *>(nullptr),
static_cast<const uint8_t *>(nullptr),
static_cast<T *>(nullptr),
static_cast<int64_t *>(nullptr),
num_items);
*temp_bytes = bytes;
return err;
}

#define DEFINE_TEMP_SIZE(suffix, Type) \
extern "C" cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t n) { \
return filter_temp_size_impl<Type>(temp_bytes, n); \
}
#define DEFINE_TEMP_SIZE(suffix, Type) \
extern "C" cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t n) { \
return filter_temp_size_impl<Type>(temp_bytes, n); \
}

DEFINE_TEMP_SIZE(u8, uint8_t)
DEFINE_TEMP_SIZE(i8, int8_t)
@@ -70,33 +65,43 @@ DEFINE_TEMP_SIZE(i128, __int128_t)
DEFINE_TEMP_SIZE(i256, __int256_t)

// CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per element)
template<typename T>
static cudaError_t filter_bytemask_impl(
void* d_temp,
size_t temp_bytes,
const T* d_in,
const uint8_t* d_flags,
T* d_out,
int64_t* d_num_selected,
int64_t num_items,
cudaStream_t stream
) {
return cub::DeviceSelect::Flagged(
d_temp, temp_bytes,
d_in, d_flags, d_out, d_num_selected,
num_items, stream
);
template <typename T>
static cudaError_t filter_bytemask_impl(void *d_temp,
size_t temp_bytes,
const T *d_in,
const uint8_t *d_flags,
T *d_out,
int64_t *d_num_selected,
int64_t num_items,
cudaStream_t stream) {
return cub::DeviceSelect::Flagged(d_temp,
temp_bytes,
d_in,
d_flags,
d_out,
d_num_selected,
num_items,
stream);
}

#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
extern "C" cudaError_t filter_bytemask_##suffix( \
void* d_temp, size_t temp_bytes, \
const Type* d_in, const uint8_t* d_flags, \
Type* d_out, int64_t* d_num_selected, \
int64_t num_items, cudaStream_t stream \
) { \
return filter_bytemask_impl<Type>(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items, stream); \
}
#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
extern "C" cudaError_t filter_bytemask_##suffix(void *d_temp, \
size_t temp_bytes, \
const Type *d_in, \
const uint8_t *d_flags, \
Type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream) { \
return filter_bytemask_impl<Type>(d_temp, \
temp_bytes, \
d_in, \
d_flags, \
d_out, \
d_num_selected, \
num_items, \
stream); \
}

DEFINE_FILTER_BYTEMASK(u8, uint8_t)
DEFINE_FILTER_BYTEMASK(i8, int8_t)
@@ -125,41 +130,51 @@ DEFINE_FILTER_BYTEMASK(i256, __int256_t)
// d_num_selected: Output count of selected elements
// num_items: Number of input elements
// stream: CUDA stream
template<typename T>
static cudaError_t filter_bitmask_impl(
void* d_temp,
size_t temp_bytes,
const T* d_in,
const uint8_t* d_bitmask,
uint64_t bit_offset,
T* d_out,
int64_t* d_num_selected,
int64_t num_items,
cudaStream_t stream
) {
template <typename T>
static cudaError_t filter_bitmask_impl(void *d_temp,
size_t temp_bytes,
const T *d_in,
const uint8_t *d_bitmask,
uint64_t bit_offset,
T *d_out,
int64_t *d_num_selected,
int64_t num_items,
cudaStream_t stream) {
// Create a transform iterator to read packed bits.
BitExtractor extractor{d_bitmask, bit_offset};
BitExtractor extractor {d_bitmask, bit_offset};
thrust::counting_iterator<int64_t> counting_iter(0);
PackedBitIterator flag_iter(counting_iter, extractor);

return cub::DeviceSelect::Flagged(
d_temp, temp_bytes,
d_in, flag_iter, d_out, d_num_selected,
num_items, stream
);
return cub::DeviceSelect::Flagged(d_temp,
temp_bytes,
d_in,
flag_iter,
d_out,
d_num_selected,
num_items,
stream);
}

#define DEFINE_FILTER_BITMASK(suffix, Type) \
extern "C" cudaError_t filter_bitmask_##suffix( \
void* d_temp, size_t temp_bytes, \
const Type* d_in, \
const uint8_t* d_bitmask, \
uint64_t bit_offset, \
Type* d_out, int64_t* d_num_selected, \
int64_t num_items, cudaStream_t stream \
) { \
return filter_bitmask_impl<Type>(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, d_num_selected, num_items, stream); \
}
#define DEFINE_FILTER_BITMASK(suffix, Type) \
extern "C" cudaError_t filter_bitmask_##suffix(void *d_temp, \
size_t temp_bytes, \
const Type *d_in, \
const uint8_t *d_bitmask, \
uint64_t bit_offset, \
Type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream) { \
return filter_bitmask_impl<Type>(d_temp, \
temp_bytes, \
d_in, \
d_bitmask, \
bit_offset, \
d_out, \
d_num_selected, \
num_items, \
stream); \
}

DEFINE_FILTER_BITMASK(u8, uint8_t)
DEFINE_FILTER_BITMASK(i8, int8_t)
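
As background on the calling convention these wrappers expose: CUB's DeviceSelect::Flagged is a two-pass API, so a caller first asks filter_temp_size_* how much device scratch space is needed for a given element count, allocates it, and then runs filter_bytemask_* with the same count on its stream. The sketch below is a hypothetical host-side caller for the u32 variant; the function name, the bare cudaMalloc/cudaFree, and the omitted error handling are illustrative only, not part of this PR.

#include <cuda_runtime.h>
#include <stddef.h>
#include <stdint.h>

// Prototypes as exported by filter.cu.
extern "C" cudaError_t filter_temp_size_u32(size_t *temp_bytes, int64_t n);
extern "C" cudaError_t filter_bytemask_u32(void *d_temp,
                                           size_t temp_bytes,
                                           const uint32_t *d_in,
                                           const uint8_t *d_flags,
                                           uint32_t *d_out,
                                           int64_t *d_num_selected,
                                           int64_t num_items,
                                           cudaStream_t stream);

void filter_u32_example(const uint32_t *d_in,
                        const uint8_t *d_flags,
                        uint32_t *d_out,
                        int64_t *d_num_selected,
                        int64_t num_items,
                        cudaStream_t stream) {
    // Pass 1: query how much temporary storage DeviceSelect::Flagged needs.
    size_t temp_bytes = 0;
    filter_temp_size_u32(&temp_bytes, num_items);

    // Allocate the scratch buffer (a stream-ordered pool allocation also works).
    void *d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);

    // Pass 2: run the flagged selection; *d_num_selected receives the output count.
    filter_bytemask_u32(d_temp, temp_bytes, d_in, d_flags,
                        d_out, d_num_selected, num_items, stream);

    cudaFree(d_temp);
}

The filter_bitmask_* variants take the same arguments plus a bit_offset, but read the flags from a packed bit buffer through the BitExtractor transform iterator, so validity bitmaps can be consumed directly without first expanding bits to bytes in a separate kernel.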
72 changes: 34 additions & 38 deletions vortex-cuda/cub/kernels/filter.h
@@ -17,47 +17,45 @@ typedef struct {

// CUDA types - defined as opaque for bindgen
typedef int cudaError_t;
typedef void* cudaStream_t;
typedef void *cudaStream_t;

#ifdef __cplusplus
extern "C" {
#endif

// X-macro table: (suffix, c_type)
#define FILTER_TYPE_TABLE(X) \
X(u8, uint8_t) \
X(i8, int8_t) \
X(u16, uint16_t) \
X(i16, int16_t) \
X(u32, uint32_t) \
X(i32, int32_t) \
X(u64, uint64_t) \
X(i64, int64_t) \
X(f32, float) \
X(f64, double) \
X(i128, __int128_t) \
#define FILTER_TYPE_TABLE(X) \
X(u8, uint8_t) \
X(i8, int8_t) \
X(u16, uint16_t) \
X(i16, int16_t) \
X(u32, uint32_t) \
X(i32, int32_t) \
X(u64, uint64_t) \
X(i64, int64_t) \
X(f32, float) \
X(f64, double) \
X(i128, __int128_t) \
X(i256, __int256_t)

// Filter temp size query functions
#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t num_items);
#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t num_items);

FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE)

#undef DECLARE_FILTER_TEMP_SIZE

// Filter execution functions (byte mask - one byte per element)
#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \
cudaError_t filter_bytemask_##suffix( \
void* d_temp, \
size_t temp_bytes, \
const c_type* d_in, \
const uint8_t* d_flags, \
c_type* d_out, \
int64_t* d_num_selected, \
int64_t num_items, \
cudaStream_t stream \
);
#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \
cudaError_t filter_bytemask_##suffix(void *d_temp, \
size_t temp_bytes, \
const c_type *d_in, \
const uint8_t *d_flags, \
c_type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream);

FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)

@@ -68,18 +66,16 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)
// These functions accept packed bit mask directly, avoiding the need to
// expand bits to bytes in a separate kernel. Uses CUB's TransformInputIterator
// to read bits on-the-fly during the filter operation.
#define DECLARE_FILTER_BITMASK(suffix, c_type) \
cudaError_t filter_bitmask_##suffix( \
void* d_temp, \
size_t temp_bytes, \
const c_type* d_in, \
const uint8_t* d_bitmask, \
uint64_t bit_offset, \
c_type* d_out, \
int64_t* d_num_selected, \
int64_t num_items, \
cudaStream_t stream \
);
#define DECLARE_FILTER_BITMASK(suffix, c_type) \
cudaError_t filter_bitmask_##suffix(void *d_temp, \
size_t temp_bytes, \
const c_type *d_in, \
const uint8_t *d_bitmask, \
uint64_t bit_offset, \
c_type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream);

FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK)
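
The X-macro table above is what keeps the three prototype families in sync: each DECLARE_* macro is applied to the same (suffix, c_type) rows, so supporting a new element type only means adding one row. A small, hypothetical illustration of how the table expands and how it could be reused (none of this is part of the PR):

// FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE) expands to one prototype per row:
cudaError_t filter_temp_size_u8(size_t *temp_bytes, int64_t num_items);
cudaError_t filter_temp_size_i8(size_t *temp_bytes, int64_t num_items);
cudaError_t filter_temp_size_u16(size_t *temp_bytes, int64_t num_items);
// ... and so on for the remaining rows.

// The same table can stamp out other per-type boilerplate, e.g. a count of
// the supported element types (hypothetical helper, not in this header):
#define COUNT_FILTER_TYPE(suffix, c_type) +1
enum { kNumFilterTypes = 0 FILTER_TYPE_TABLE(COUNT_FILTER_TYPE) }; // == 12
#undef COUNT_FILTER_TYPE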

24 changes: 11 additions & 13 deletions vortex-cuda/kernels/src/alp.cu
@@ -7,10 +7,10 @@
// Converts integers to floats by multiplying by precomputed exponent factors.
// Formula: decoded = (float)encoded * f * e
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
template<typename EncodedT, typename FloatT>
template <typename EncodedT, typename FloatT>
struct AlpOp {
FloatT f; // F10[exponents.f] - power of 10
FloatT e; // IF10[exponents.e] - inverse power of 10
FloatT f; // F10[exponents.f] - power of 10
FloatT e; // IF10[exponents.e] - inverse power of 10

__device__ inline FloatT operator()(EncodedT value) const {
return static_cast<FloatT>(value) * f * e;
@@ -19,16 +19,14 @@ struct AlpOp {

// Macro to generate ALP kernel for each type combination.
// Input is integer (encoded), output is float (decoded).
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
extern "C" __global__ void alp_##enc_suffix##_##float_suffix( \
const EncType *__restrict encoded, \
FloatType *__restrict decoded, \
FloatType f, \
FloatType e, \
uint64_t array_len \
) { \
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType>{f, e}); \
}
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
FloatType *__restrict decoded, \
FloatType f, \
FloatType e, \
uint64_t array_len) { \
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
}

// f32 variants (ALP for f32 encodes as i32 or i64)
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
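
The ALP decode kernels above apply decoded = (float)encoded * f * e elementwise, with f and e precomputed from the ALP exponents before launch. Below is a hypothetical host-side launch of the i32 -> f32 variant; the launcher name and the 256-thread block size are illustrative, and the sketch assumes the kernel declaration is linkable from host code (same translation unit or relocatable device code), since the real callers presumably drive these kernels from the Rust side of vortex-cuda.

#include <cuda_runtime.h>
#include <stdint.h>

// Declaration of the generated kernel (matches the GENERATE_ALP_KERNEL expansion).
extern "C" __global__ void alp_i32_f32(const int32_t *__restrict encoded,
                                       float *__restrict decoded,
                                       float f,
                                       float e,
                                       uint64_t array_len);

// Hypothetical launcher: one thread per element, 256 threads per block.
void launch_alp_i32_f32(const int32_t *d_encoded,
                        float *d_decoded,
                        float f,
                        float e,
                        uint64_t array_len,
                        cudaStream_t stream) {
    if (array_len == 0) {
        return;  // Avoid a zero-sized grid.
    }
    const uint32_t block = 256;
    const uint32_t grid = static_cast<uint32_t>((array_len + block - 1) / block);
    alp_i32_f32<<<grid, block, 0, stream>>>(d_encoded, d_decoded, f, e, array_len);
}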