@@ -14,9 +14,11 @@ DerivePointerAlignment: false
PointerAlignment: Right
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowAllArgumentsOnNextLine: false
AllowAllConstructorInitializersOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
AlignAfterOpenBracket: Align
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
@@ -32,4 +34,3 @@ IncludeBlocks: Regroup
Language: Cpp
AccessModifierOffset: -4
---
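
The reformatting in the rest of this PR follows from these settings: with AllowAllArgumentsOnNextLine, AllowAllConstructorInitializersOnNextLine, AllowAllParametersOfDeclarationOnNextLine, BinPackArguments, and BinPackParameters all set to false, and AlignAfterOpenBracket: Align, any signature or call that no longer fits on one line is broken with one argument per line, aligned under the opening parenthesis. A minimal illustration of the effect (example_fn is a placeholder, not code from the repository):

// Old settings: clang-format may pack several parameters per continuation line.
cudaError_t example_fn(
    void *d_temp, size_t temp_bytes, const uint32_t *d_in, const uint8_t *d_flags,
    uint32_t *d_out, int64_t *d_num_selected, int64_t num_items, cudaStream_t stream);

// New settings: one parameter per line, aligned to the opening parenthesis.
cudaError_t example_fn(void *d_temp,
                       size_t temp_bytes,
                       const uint32_t *d_in,
                       const uint8_t *d_flags,
                       uint32_t *d_out,
                       int64_t *d_num_selected,
                       int64_t num_items,
                       cudaStream_t stream);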

169 changes: 92 additions & 77 deletions vortex-cuda/cub/kernels/filter.cu
@@ -4,10 +4,10 @@
// CUB DeviceSelect::Flagged wrapper for Vortex GPU filtering.

#include <cub/cub.cuh>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

// i256 type
typedef struct {
@@ -17,11 +17,10 @@ typedef struct {

// Bit extraction functor for TransformInputIterator
struct BitExtractor {
const uint8_t* packed;
const uint8_t *packed;
uint64_t bit_offset;

__host__ __device__ inline
uint8_t operator()(int64_t idx) const {
__host__ __device__ inline uint8_t operator()(int64_t idx) const {
uint64_t actual_bit = bit_offset + static_cast<uint64_t>(idx);
uint64_t byte_idx = actual_bit / 8;
uint32_t bit_idx = actual_bit % 8;
@@ -30,31 +29,27 @@
};

/// Type alias for the packed bit iterator.
using PackedBitIterator = thrust::transform_iterator<
BitExtractor,
thrust::counting_iterator<int64_t>
>;
using PackedBitIterator = thrust::transform_iterator<BitExtractor, thrust::counting_iterator<int64_t>>;

// CUB DeviceSelect::Flagged - Query temp storage size
template<typename T>
static cudaError_t filter_temp_size_impl(size_t* temp_bytes, int64_t num_items) {
template <typename T>
static cudaError_t filter_temp_size_impl(size_t *temp_bytes, int64_t num_items) {
size_t bytes = 0;
cudaError_t err = cub::DeviceSelect::Flagged(
nullptr, bytes,
static_cast<const T*>(nullptr),
static_cast<const uint8_t*>(nullptr),
static_cast<T*>(nullptr),
static_cast<int64_t*>(nullptr),
num_items
);
cudaError_t err = cub::DeviceSelect::Flagged(nullptr,
bytes,
static_cast<const T *>(nullptr),
static_cast<const uint8_t *>(nullptr),
static_cast<T *>(nullptr),
static_cast<int64_t *>(nullptr),
num_items);
*temp_bytes = bytes;
return err;
}

#define DEFINE_TEMP_SIZE(suffix, Type) \
extern "C" cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t n) { \
return filter_temp_size_impl<Type>(temp_bytes, n); \
}
#define DEFINE_TEMP_SIZE(suffix, Type) \
extern "C" cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t n) { \
return filter_temp_size_impl<Type>(temp_bytes, n); \
}

DEFINE_TEMP_SIZE(u8, uint8_t)
DEFINE_TEMP_SIZE(i8, int8_t)
@@ -70,33 +65,43 @@ DEFINE_TEMP_SIZE(i128, __int128_t)
DEFINE_TEMP_SIZE(i256, __int256_t)

// CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per element)
template<typename T>
static cudaError_t filter_bytemask_impl(
void* d_temp,
size_t temp_bytes,
const T* d_in,
const uint8_t* d_flags,
T* d_out,
int64_t* d_num_selected,
int64_t num_items,
cudaStream_t stream
) {
return cub::DeviceSelect::Flagged(
d_temp, temp_bytes,
d_in, d_flags, d_out, d_num_selected,
num_items, stream
);
template <typename T>
static cudaError_t filter_bytemask_impl(void *d_temp,
size_t temp_bytes,
const T *d_in,
const uint8_t *d_flags,
T *d_out,
int64_t *d_num_selected,
int64_t num_items,
cudaStream_t stream) {
return cub::DeviceSelect::Flagged(d_temp,
temp_bytes,
d_in,
d_flags,
d_out,
d_num_selected,
num_items,
stream);
}

#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
extern "C" cudaError_t filter_bytemask_##suffix( \
void* d_temp, size_t temp_bytes, \
const Type* d_in, const uint8_t* d_flags, \
Type* d_out, int64_t* d_num_selected, \
int64_t num_items, cudaStream_t stream \
) { \
return filter_bytemask_impl<Type>(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items, stream); \
}
#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
extern "C" cudaError_t filter_bytemask_##suffix(void *d_temp, \
size_t temp_bytes, \
const Type *d_in, \
const uint8_t *d_flags, \
Type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream) { \
return filter_bytemask_impl<Type>(d_temp, \
temp_bytes, \
d_in, \
d_flags, \
d_out, \
d_num_selected, \
num_items, \
stream); \
}

DEFINE_FILTER_BYTEMASK(u8, uint8_t)
DEFINE_FILTER_BYTEMASK(i8, int8_t)
@@ -125,41 +130,51 @@ DEFINE_FILTER_BYTEMASK(i256, __int256_t)
// d_num_selected: Output count of selected elements
// num_items: Number of input elements
// stream: CUDA stream
template<typename T>
static cudaError_t filter_bitmask_impl(
void* d_temp,
size_t temp_bytes,
const T* d_in,
const uint8_t* d_bitmask,
uint64_t bit_offset,
T* d_out,
int64_t* d_num_selected,
int64_t num_items,
cudaStream_t stream
) {
template <typename T>
static cudaError_t filter_bitmask_impl(void *d_temp,
size_t temp_bytes,
const T *d_in,
const uint8_t *d_bitmask,
uint64_t bit_offset,
T *d_out,
int64_t *d_num_selected,
int64_t num_items,
cudaStream_t stream) {
// Create a transform iterator to read packed bits.
BitExtractor extractor{d_bitmask, bit_offset};
BitExtractor extractor {d_bitmask, bit_offset};
thrust::counting_iterator<int64_t> counting_iter(0);
PackedBitIterator flag_iter(counting_iter, extractor);

return cub::DeviceSelect::Flagged(
d_temp, temp_bytes,
d_in, flag_iter, d_out, d_num_selected,
num_items, stream
);
return cub::DeviceSelect::Flagged(d_temp,
temp_bytes,
d_in,
flag_iter,
d_out,
d_num_selected,
num_items,
stream);
}

#define DEFINE_FILTER_BITMASK(suffix, Type) \
extern "C" cudaError_t filter_bitmask_##suffix( \
void* d_temp, size_t temp_bytes, \
const Type* d_in, \
const uint8_t* d_bitmask, \
uint64_t bit_offset, \
Type* d_out, int64_t* d_num_selected, \
int64_t num_items, cudaStream_t stream \
) { \
return filter_bitmask_impl<Type>(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, d_num_selected, num_items, stream); \
}
#define DEFINE_FILTER_BITMASK(suffix, Type) \
extern "C" cudaError_t filter_bitmask_##suffix(void *d_temp, \
size_t temp_bytes, \
const Type *d_in, \
const uint8_t *d_bitmask, \
uint64_t bit_offset, \
Type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream) { \
return filter_bitmask_impl<Type>(d_temp, \
temp_bytes, \
d_in, \
d_bitmask, \
bit_offset, \
d_out, \
d_num_selected, \
num_items, \
stream); \
}

DEFINE_FILTER_BITMASK(u8, uint8_t)
DEFINE_FILTER_BITMASK(i8, int8_t)
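
As background on the calling convention these wrappers expose: CUB's DeviceSelect::Flagged is a two-pass API, so a caller first asks filter_temp_size_* how much device scratch space is needed for a given element count, allocates it, and then runs filter_bytemask_* with the same count on its stream. The sketch below is a hypothetical host-side caller for the u32 variant; the function name, the bare cudaMalloc/cudaFree, and the omitted error handling are illustrative only, not part of this PR.

#include <cuda_runtime.h>
#include <stddef.h>
#include <stdint.h>

// Prototypes as exported by filter.cu.
extern "C" cudaError_t filter_temp_size_u32(size_t *temp_bytes, int64_t n);
extern "C" cudaError_t filter_bytemask_u32(void *d_temp,
                                           size_t temp_bytes,
                                           const uint32_t *d_in,
                                           const uint8_t *d_flags,
                                           uint32_t *d_out,
                                           int64_t *d_num_selected,
                                           int64_t num_items,
                                           cudaStream_t stream);

void filter_u32_example(const uint32_t *d_in,
                        const uint8_t *d_flags,
                        uint32_t *d_out,
                        int64_t *d_num_selected,
                        int64_t num_items,
                        cudaStream_t stream) {
    // Pass 1: query how much temporary storage DeviceSelect::Flagged needs.
    size_t temp_bytes = 0;
    filter_temp_size_u32(&temp_bytes, num_items);

    // Allocate the scratch buffer (a stream-ordered pool allocation also works).
    void *d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);

    // Pass 2: run the flagged selection; *d_num_selected receives the output count.
    filter_bytemask_u32(d_temp, temp_bytes, d_in, d_flags,
                        d_out, d_num_selected, num_items, stream);

    cudaFree(d_temp);
}

The filter_bitmask_* variants take the same arguments plus a bit_offset, but read the flags from a packed bit buffer through the BitExtractor transform iterator, so validity bitmaps can be consumed directly without first expanding bits to bytes in a separate kernel.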
72 changes: 34 additions & 38 deletions vortex-cuda/cub/kernels/filter.h
@@ -17,47 +17,45 @@ typedef struct {

// CUDA types - defined as opaque for bindgen
typedef int cudaError_t;
typedef void* cudaStream_t;
typedef void *cudaStream_t;

#ifdef __cplusplus
extern "C" {
#endif

// X-macro table: (suffix, c_type)
#define FILTER_TYPE_TABLE(X) \
X(u8, uint8_t) \
X(i8, int8_t) \
X(u16, uint16_t) \
X(i16, int16_t) \
X(u32, uint32_t) \
X(i32, int32_t) \
X(u64, uint64_t) \
X(i64, int64_t) \
X(f32, float) \
X(f64, double) \
X(i128, __int128_t) \
#define FILTER_TYPE_TABLE(X) \
X(u8, uint8_t) \
X(i8, int8_t) \
X(u16, uint16_t) \
X(i16, int16_t) \
X(u32, uint32_t) \
X(i32, int32_t) \
X(u64, uint64_t) \
X(i64, int64_t) \
X(f32, float) \
X(f64, double) \
X(i128, __int128_t) \
X(i256, __int256_t)

// Filter temp size query functions
#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t num_items);
#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t num_items);

FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE)

#undef DECLARE_FILTER_TEMP_SIZE

// Filter execution functions (byte mask - one byte per element)
#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \
cudaError_t filter_bytemask_##suffix( \
void* d_temp, \
size_t temp_bytes, \
const c_type* d_in, \
const uint8_t* d_flags, \
c_type* d_out, \
int64_t* d_num_selected, \
int64_t num_items, \
cudaStream_t stream \
);
#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \
cudaError_t filter_bytemask_##suffix(void *d_temp, \
size_t temp_bytes, \
const c_type *d_in, \
const uint8_t *d_flags, \
c_type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream);

FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)

@@ -68,18 +66,16 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)
// These functions accept packed bit mask directly, avoiding the need to
// expand bits to bytes in a separate kernel. Uses CUB's TransformInputIterator
// to read bits on-the-fly during the filter operation.
#define DECLARE_FILTER_BITMASK(suffix, c_type) \
cudaError_t filter_bitmask_##suffix( \
void* d_temp, \
size_t temp_bytes, \
const c_type* d_in, \
const uint8_t* d_bitmask, \
uint64_t bit_offset, \
c_type* d_out, \
int64_t* d_num_selected, \
int64_t num_items, \
cudaStream_t stream \
);
#define DECLARE_FILTER_BITMASK(suffix, c_type) \
cudaError_t filter_bitmask_##suffix(void *d_temp, \
size_t temp_bytes, \
const c_type *d_in, \
const uint8_t *d_bitmask, \
uint64_t bit_offset, \
c_type *d_out, \
int64_t *d_num_selected, \
int64_t num_items, \
cudaStream_t stream);

FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK)
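
The X-macro table above is what keeps the three prototype families in sync: each DECLARE_* macro is applied to the same (suffix, c_type) rows, so supporting a new element type only means adding one row. A small, hypothetical illustration of how the table expands and how it could be reused (none of this is part of the PR):

// FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE) expands to one prototype per row:
cudaError_t filter_temp_size_u8(size_t *temp_bytes, int64_t num_items);
cudaError_t filter_temp_size_i8(size_t *temp_bytes, int64_t num_items);
cudaError_t filter_temp_size_u16(size_t *temp_bytes, int64_t num_items);
// ... and so on for the remaining rows.

// The same table can stamp out other per-type boilerplate, e.g. a count of
// the supported element types (hypothetical helper, not in this header):
#define COUNT_FILTER_TYPE(suffix, c_type) +1
enum { kNumFilterTypes = 0 FILTER_TYPE_TABLE(COUNT_FILTER_TYPE) }; // == 12
#undef COUNT_FILTER_TYPE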

24 changes: 11 additions & 13 deletions vortex-cuda/kernels/src/alp.cu
@@ -7,10 +7,10 @@
// Converts integers to floats by multiplying by precomputed exponent factors.
// Formula: decoded = (float)encoded * f * e
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
template<typename EncodedT, typename FloatT>
template <typename EncodedT, typename FloatT>
struct AlpOp {
FloatT f; // F10[exponents.f] - power of 10
FloatT e; // IF10[exponents.e] - inverse power of 10
FloatT f; // F10[exponents.f] - power of 10
FloatT e; // IF10[exponents.e] - inverse power of 10

__device__ inline FloatT operator()(EncodedT value) const {
return static_cast<FloatT>(value) * f * e;
@@ -19,16 +19,14 @@ struct AlpOp {

// Macro to generate ALP kernel for each type combination.
// Input is integer (encoded), output is float (decoded).
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
extern "C" __global__ void alp_##enc_suffix##_##float_suffix( \
const EncType *__restrict encoded, \
FloatType *__restrict decoded, \
FloatType f, \
FloatType e, \
uint64_t array_len \
) { \
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType>{f, e}); \
}
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
FloatType *__restrict decoded, \
FloatType f, \
FloatType e, \
uint64_t array_len) { \
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
}

// f32 variants (ALP for f32 encodes as i32 or i64)
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
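
The ALP decode kernels above apply decoded = (float)encoded * f * e elementwise, with f and e precomputed from the ALP exponents before launch. Below is a hypothetical host-side launch of the i32 -> f32 variant; the launcher name and the 256-thread block size are illustrative, and the sketch assumes the kernel declaration is linkable from host code (same translation unit or relocatable device code), since the real callers presumably drive these kernels from the Rust side of vortex-cuda.

#include <cuda_runtime.h>
#include <stdint.h>

// Declaration of the generated kernel (matches the GENERATE_ALP_KERNEL expansion).
extern "C" __global__ void alp_i32_f32(const int32_t *__restrict encoded,
                                       float *__restrict decoded,
                                       float f,
                                       float e,
                                       uint64_t array_len);

// Hypothetical launcher: one thread per element, 256 threads per block.
void launch_alp_i32_f32(const int32_t *d_encoded,
                        float *d_decoded,
                        float f,
                        float e,
                        uint64_t array_len,
                        cudaStream_t stream) {
    if (array_len == 0) {
        return;  // Avoid a zero-sized grid.
    }
    const uint32_t block = 256;
    const uint32_t grid = static_cast<uint32_t>((array_len + block - 1) / block);
    alp_i32_f32<<<grid, block, 0, stream>>>(d_encoded, d_decoded, f, e, array_len);
}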