vortex-data · 0ax1 · May 29, 2026 · onursatici · May 29, 2026
diff --git a/vortex-cuda/kernels/src/decimal_cast.cu b/vortex-cuda/kernels/src/decimal_cast.cu
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include "config.cuh"
+#include "types.cuh"
+#include <stdint.h>
+#include <type_traits>
+
+// Arrow decimal schemas fix the physical values buffer width:
+//   - Decimal128: 16 bytes per value.
+//   - Decimal256: 32 bytes per value.
+//
+// Vortex may use narrower decimal storage, so Arrow Device export widens values
+// to match the schema-implied physical layout consumed by cuDF and other Arrow
+// readers.
+// Converts a decimal storage value to Arrow's 128-bit decimal physical representation.
+template <typename Input>
+__device__ __forceinline__ int128_t decimal_to_i128(Input value) {
+    if constexpr (std::is_same_v<Input, int128_t>) {
+        return value;
+    } else if constexpr (std::is_same_v<Input, int256_t>) {
+        return int128_t {value.parts[0], value.parts[1]};
+    } else {
+        const int64_t lo = static_cast<int64_t>(value);
+        const int64_t hi = value < 0 ? -1 : 0;
+        return int128_t {lo, hi};
+    }
+}
+
+// Converts one decimal value to the requested Arrow decimal physical representation.
+template <typename Output, typename Input>
+__device__ __forceinline__ Output decimal_cast_value(Input value) {
+    if constexpr (std::is_same_v<Output, int256_t> && std::is_same_v<Input, int256_t>) {
+        return value;
+    } else if constexpr (std::is_same_v<Output, int128_t>) {
+        return decimal_to_i128(value);
+    } else {
+        static_assert(std::is_same_v<Output, int256_t>);
+        const int128_t value128 = decimal_to_i128(value);
+        const int64_t sign = value128.hi < 0 ? -1 : 0;
+        return int256_t {{value128.lo, value128.hi, sign, sign}};
+    }
+}
+
+// Widens a contiguous decimal values buffer on the device.
+template <typename Input, typename Output>
+__device__ void
+decimal_cast_device(const Input *__restrict input, Output *__restrict output, uint64_t array_len) {
+    const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint64_t startElem = start_elem(worker, array_len);
+    const uint64_t stopElem = stop_elem(worker, array_len);
+
+    if (startElem >= array_len) {
+        return;
+    }
+
+    for (uint64_t idx = startElem; idx < stopElem; idx++) {
+        output[idx] = decimal_cast_value<Output>(input[idx]);
+    }
+}
+
+// Generates Arrow Decimal128 and Decimal256 widening kernels for one input storage type.
+#define GENERATE_DECIMAL_CAST_KERNELS(input_suffix, InputType)                                               \
+    extern "C" __global__ void decimal_cast_##input_suffix##_i128(const InputType *__restrict input,         \
+                                                                  int128_t *__restrict output,               \
+                                                                  uint64_t array_len) {                      \
+        decimal_cast_device(input, output, array_len);                                                       \
+    }                                                                                                        \
+    extern "C" __global__ void decimal_cast_##input_suffix##_i256(const InputType *__restrict input,         \
+                                                                  int256_t *__restrict output,               \
+                                                                  uint64_t array_len) {                      \
+        decimal_cast_device(input, output, array_len);                                                       \
+    }
+
+FOR_EACH_SIGNED_INT(GENERATE_DECIMAL_CAST_KERNELS)
+FOR_EACH_LARGE_DECIMAL(GENERATE_DECIMAL_CAST_KERNELS)