diff --git a/include/infiniop/ops/dequant/per_tensor_dequant_int8.h b/include/infiniop/ops/dequant/per_tensor_dequant_int8.h new file mode 100644 index 000000000..9614b4303 --- /dev/null +++ b/include/infiniop/ops/dequant/per_tensor_dequant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__ +#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorDequantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc); + +__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc); + +#endif diff --git a/include/infiniop/ops/quant/per_tensor_quant_int8.h b/include/infiniop/ops/quant/per_tensor_quant_int8.h new file mode 100644 index 000000000..16e1c2bc6 --- /dev/null +++ b/include/infiniop/ops/quant/per_tensor_quant_int8.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__ +#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, 
+                                                                          infiniopTensorDescriptor_t x_scale_desc,
+                                                                          infiniopTensorDescriptor_t x_zero_desc,
+                                                                          infiniopTensorDescriptor_t x_desc);
+
+__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size);
+
+__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc,
+                                                            void *workspace,
+                                                            size_t workspace_size,
+                                                            void *x_packed,
+                                                            void *x_scale,
+                                                            void *x_zero,
+                                                            const void *x,
+                                                            const bool is_static,
+                                                            void *stream);
+
+__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
new file mode 100644
index 000000000..ea1acaa2d
--- /dev/null
+++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
@@ -0,0 +1,40 @@
+#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
+#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
+
+// Symmetric (zero-point-free) per-tensor int8 dequantization:
+//   x[index] = Tout(float(x_packed[p_index]) * x_scale[0])
+// Layout is described as (batch, channel, hidden, width) with separate
+// strides for the output and the packed tensor; a grid-stride loop
+// covers all num_elements elements.
+template <typename Tout, typename Tin>
+__device__ void perTensorDequantI8SymKernel(
+    Tout *x, const Tin *x_packed, const float *x_scale,
+    size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
+    ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
+    ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3,
+    int num_elements) {
+
+    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int grid_size = blockDim.x * gridDim.x;
+    const float x_scale_val = x_scale[0];
+    for (int tid = gid; tid < num_elements; tid += grid_size) {
+        // BUGFIX: decompose the flat index in a scratch variable. The
+        // original mutated `tid` itself, so `tid += grid_size` strode from
+        // a corrupted value whenever the grid did not cover num_elements.
+        int rem = tid;
+        int w = rem % (int)width;
+        rem /= (int)width;
+        int h = rem % (int)hidden_dim;
+        rem /= (int)hidden_dim;
+        int c = rem % (int)channel;
+        rem /= (int)channel;
+        int b = rem % (int)batch_size;
+
+        int index = w * (int)strides_3 + h * (int)strides_2 + c * (int)strides_1 + b * (int)strides_0;
+        int p_index = w * (int)p_strides_3 + h * (int)p_strides_2 + c * (int)p_strides_1 + b * (int)p_strides_0;
+
+        float val = static_cast<float>(x_packed[p_index]) * x_scale_val;
+        x[index] = static_cast<Tout>(val);
+    }
+}
+
+#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h b/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
new file mode 100644
index 000000000..4614f5f40
--- /dev/null
+++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
@@ -0,0 +1,76 @@
+#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__
+#define __PER_TENSOR_DEQUANT_INT8_INFO_H__
+
+#include "../../../../utils.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+namespace op::per_tensor_dequant_int8 {
+
+// Validated dtype/shape/stride metadata for the per-tensor int8 dequant op.
+class PerTensorDequantI8Info {
+private:
+    PerTensorDequantI8Info() = default;
+
+public:
+    infiniDtype_t dtype, packed_type;
+    size_t batch_size, channel, hidden_dim, width;
+    ptrdiff_t strides_0, strides_1, strides_2, strides_3;
+    ptrdiff_t p_strides_0, p_strides_1, p_strides_2, p_strides_3;
+    int num_elements;
+
+    static utils::Result<PerTensorDequantI8Info> createPerTensorDequantI8Info(
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t x_packed_desc,
+        infiniopTensorDescriptor_t x_scale_desc,
+        infiniopTensorDescriptor_t x_zero_desc) {
+
+        CHECK_OR_RETURN(
+            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t dtype = x_desc->dtype();
+        const infiniDtype_t packed_type = x_packed_desc->dtype();
+
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
+
+        auto shape = x_desc->shape();
+        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());
+
+        // At most 4 logical dimensions: (batch, channel, hidden, width).
+        auto ndim = x_desc->ndim();
+        CHECK_OR_RETURN(ndim <= 4,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        size_t width = shape[ndim - 1];
+        size_t hidden_dim = (ndim > 1 ? shape[ndim - 2] : 1);
+        size_t channel = (ndim > 2 ? shape[ndim - 3] : 1);
+        size_t batch_size = (ndim > 3 ?
shape[ndim - 4] : 1); + + ptrdiff_t strides_3 = x_desc->strides()[ndim - 1]; + ptrdiff_t strides_2 = (ndim > 1 ? x_desc->strides()[ndim - 2] : 0); + ptrdiff_t strides_1 = (ndim > 2 ? x_desc->strides()[ndim - 3] : 0); + ptrdiff_t strides_0 = (ndim > 3 ? x_desc->strides()[ndim - 4] : 0); + + ptrdiff_t p_strides_3 = x_packed_desc->strides()[ndim - 1]; + ptrdiff_t p_strides_2 = (ndim > 1 ? x_packed_desc->strides()[ndim - 2] : 0); + ptrdiff_t p_strides_1 = (ndim > 2 ? x_packed_desc->strides()[ndim - 3] : 0); + ptrdiff_t p_strides_0 = (ndim > 3 ? x_packed_desc->strides()[ndim - 4] : 0); + + int num_elements = 1; + for (int i = 0; i < (int)ndim; i++) { + num_elements *= static_cast(shape[i]); + } + + return utils::Result(PerTensorDequantI8Info{ + dtype, + packed_type, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements}); + } +}; + +} // namespace op::per_tensor_dequant_int8 + +#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu new file mode 100644 index 000000000..3f62e1b33 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu @@ -0,0 +1,114 @@ +#include "../../../../devices/nvidia/nvidia_common.cuh" +#include "per_tensor_dequant_int8_nvidia.cuh" + +#include "../../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../reduce/cuda/reduce.cuh" +#include + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL perTensorDequantI8Sym( + Tout *x, const Tin *x_packed, const float *x_scale, + size_t batch_size, size_t channel, size_t hidden_dim, size_t width, + ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3, + ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t 
p_strides_2, ptrdiff_t p_strides_3, + int num_elements) { + perTensorDequantI8SymKernel(x, x_packed, x_scale, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements); +} + +namespace op::per_tensor_dequant_int8::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { + auto info = PerTensorDequantI8Info::createPerTensorDequantI8Info(x_desc, x_packed_desc, x_scale_desc, x_zero_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t per_tensor_dequant_int8Kernel(const PerTensorDequantI8Info &info, Tdata *x, const int8_t *x_packed, const float *x_scale, const float *x_zero, cudaStream_t stream) { + int num_elements = (int)info.num_elements; + int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + size_t batch_size = info.batch_size; + size_t channel = info.channel; + size_t hidden_dim = info.hidden_dim; + size_t width = info.width; + + ptrdiff_t strides_0 = info.strides_0; + ptrdiff_t strides_1 = info.strides_1; + ptrdiff_t strides_2 = info.strides_2; + ptrdiff_t strides_3 = info.strides_3; + + ptrdiff_t p_strides_0 = info.p_strides_0; + ptrdiff_t p_strides_1 = info.p_strides_1; + ptrdiff_t p_strides_2 = info.p_strides_2; + ptrdiff_t p_strides_3 = info.p_strides_3; + + if (x_zero == nullptr) { + perTensorDequantI8Sym + <<>>(x, x_packed, x_scale, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, 
p_strides_2, p_strides_3, + num_elements); + } else { + return INFINI_STATUS_BAD_PARAM; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; +#define DEQUANT(BLOCK_SIZE, TDATA) \ + per_tensor_dequant_int8Kernel(_info, (TDATA *)x, (const int8_t *)x_packed, (const float *)x_scale, (const float *)x_zero, stream) +#define DEQUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return DEQUANT(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return DEQUANT(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return DEQUANT(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::per_tensor_dequant_int8::nvidia diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh new file mode 100644 index 000000000..66a7e5d03 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ +#define __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ +#include "../per_tensor_dequant_int8.h" + +DESCRIPTOR(nvidia) + +#endif // 
__PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc b/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc new file mode 100644 index 000000000..48d416847 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc @@ -0,0 +1,102 @@ +#include "../../../operator.h" +#include "../../../handle.h" +#include "infiniop/ops/dequant/per_tensor_dequant_int8.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/per_tensor_dequant_int8_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorDequantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::per_tensor_dequant_int8::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + x_packed_desc, \ + x_scale_desc, \ + x_zero_desc); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc, + void *workspace, 
+ size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream) { +#define DEQUANT(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, x, x_packed, x_scale, x_zero, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DEQUANT(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DEQUANT(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DEQUANT +} + +__INFINI_C infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h b/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h new file mode 100644 index 000000000..1ed54a8e5 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h @@ -0,0 +1,40 @@ +#ifndef __PER_TENSOR_DEQUANT_I8_H__ +#define __PER_TENSOR_DEQUANT_I8_H__ + +#include "../../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::per_tensor_dequant_int8::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + PerTensorDequantI8Info _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, PerTensorDequantI8Info info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), 
_workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t x_packed_desc, \ + infiniopTensorDescriptor_t x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *x, const void *x_packed, const void *x_scale, const void *x_zero, void *stream) const; \ + }; \ + } + +#endif // __PER_TENSOR_DEQUANT_I8_H__ diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh index 3c014de9b..91f36e4f9 100644 --- a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh +++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh @@ -1,5 +1,5 @@ -#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__ -#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#ifndef __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ +#define __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ #include __device__ inline int round_half_away_from_zero(float x) { @@ -55,8 +55,8 @@ __device__ void blockPerChannelQuantI8Kernel( float inv_scale = 1.0f / scale; float zero = -global_min * inv_scale - 128.0f; - x_scale[row] = (Tdata)scale; - x_zero[row] = (Tdata)zero; + x_scale[row] = scale; + x_zero[row] = zero; for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { @@ -111,7 +111,7 @@ __device__ void blockPerChannelQuantI8SymKernel( float inv_scale = 1.0f / scale; - x_scale[row] = (Tdata)scale; + x_scale[row] = scale; for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { @@ -270,4 +270,4 @@ __device__ void warpPerChannelQuantI8SymKernel( } } -#endif // __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#endif // __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h 
b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h index 4d1675c8c..4f1d3be2c 100644 --- a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h +++ b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h @@ -1,5 +1,5 @@ -#ifndef __QUANT_H__ -#define __QUANT_H__ +#ifndef __PER_CHANNEL_QUANT_INT8_H__ +#define __PER_CHANNEL_QUANT_INT8_H__ #include "../../../operator.h" #include "info.h" @@ -37,4 +37,4 @@ }; \ } -#endif // __QUANT_H__ +#endif // __PER_CHANNEL_QUANT_INT8_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..bb2a8d304 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh @@ -0,0 +1,130 @@ +#ifndef __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ +#define __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ + +#include + +#ifndef WARP_SIZE +#define WARP_SIZE 32 +#endif + +#define FULL_MASK 0xffffffff + + +// warp reduce max +__device__ __forceinline__ float warpReduceMax(float val) +{ + for (int offset = WARP_SIZE/2; offset > 0; offset /= 2) + val = fmaxf(val, __shfl_xor_sync(FULL_MASK, val, offset)); + return val; +} + + +// float atomic max (safe version) +__device__ __forceinline__ void atomicMaxFloat(float* addr, float val) +{ + int* addr_i = (int*)addr; + int old = *addr_i; + int assumed; + + do + { + assumed = old; + float old_f = __int_as_float(assumed); + float new_f = fmaxf(val, old_f); + + old = atomicCAS(addr_i, assumed, __float_as_int(new_f)); + + } while (assumed != old); +} + +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? 
(int)r : -(int)r;
+}
+
+// Pass 1 of dynamic symmetric quantization: reduce |x| over the whole
+// tensor and fold absmax / 127 into *x_scale with a CAS-based float
+// atomic max, one atomic per warp.
+// NOTE(review): *x_scale must be zero-initialised before this kernel
+// runs, otherwise a stale value can win the max — confirm at launch site.
+template <typename Tdata>
+__device__ void perTensorAbsmaxSymKernel(float *x_scale, const Tdata *x,
+                                         size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
+                                         ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
+                                         int num_elements) {
+    int tid = threadIdx.x;
+    int gid = blockIdx.x * blockDim.x + tid;
+    int grid_size = blockDim.x * gridDim.x;
+
+    float local_max = 0.f;
+
+    // Grid-stride loop. BUGFIX: the flat index is decomposed in a scratch
+    // variable; the original mutated the loop counter itself, corrupting
+    // the stride and skipping / repeating elements.
+    for (int i = gid; i < num_elements; i += grid_size) {
+        int rem = i;
+        int w = rem % (int)width;
+        rem /= (int)width;
+        int h = rem % (int)hidden_dim;
+        rem /= (int)hidden_dim;
+        int c = rem % (int)channel;
+        rem /= (int)channel;
+        int b = rem % (int)batch_size;
+
+        int index = w * (int)strides_3 + h * (int)strides_2 + c * (int)strides_1 + b * (int)strides_0;
+
+        local_max = fmaxf(local_max, fabsf((float)x[index]));
+    }
+
+    // Reduce within the warp, then only lane 0 of each warp does an atomic.
+    local_max = warpReduceMax(local_max);
+    if ((tid & (WARP_SIZE - 1)) == 0) {
+        atomicMaxFloat(x_scale, local_max / 127.0f);
+    }
+}
+
+// Pass 2: quantize x into int8 with the per-tensor scale, rounding half
+// away from zero and clamping symmetrically to [-127, 127].
+template <typename Tdata>
+__device__ void perTensorQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
+    ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
+    ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3,
+    int num_elements) {
+
+    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int grid_size = blockDim.x * gridDim.x;
+
+    const float scale_val = 1.0f / x_scale[0];
+
+    for (int i = gid; i < num_elements; i += grid_size) {
+        // BUGFIX: decompose in a scratch variable instead of mutating the
+        // loop counter (same grid-stride defect as the absmax kernel).
+        int rem = i;
+        int w = rem % (int)width;
+        rem /= (int)width;
+        int h = rem % (int)hidden_dim;
+        rem /= (int)hidden_dim;
+        int c = rem % (int)channel;
+        rem /= (int)channel;
+        int b = rem % (int)batch_size;
+
+        int index = w * (int)strides_3 + h * (int)strides_2 + c * (int)strides_1 + b * (int)strides_0;
+        int p_index = w * (int)p_strides_3 + h * (int)p_strides_2 + c * (int)p_strides_1 + b * (int)p_strides_0;
+
+        float qf = (float)x[index] * scale_val;
+        int q = round_half_away_from_zero(qf);
+
+        // Symmetric clamp: -127..127 (-128 is intentionally unused).
+        q = q > 127 ? 127 : (q < -127 ? -127 : q);
+
+        x_packed[p_index] = (int8_t)q;
+    }
+}
+
+#endif // __PER_TENSOR_QUANT_INT8_KERNEL_CUH__
diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/info.h b/src/infiniop/ops/quant/per_tensor_quant_int8/info.h
new file mode 100644
index 000000000..f6ed0d067
--- /dev/null
+++ b/src/infiniop/ops/quant/per_tensor_quant_int8/info.h
@@ -0,0 +1,77 @@
+#ifndef __PER_TENSOR_QUANT_INT8_INFO_H__
+#define __PER_TENSOR_QUANT_INT8_INFO_H__
+
+#include "../../../../utils.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+namespace op::per_tensor_quant_int8 {
+
+// Validated dtype/shape/stride metadata for the per-tensor int8 quant op.
+class PerTensorQuantI8Info {
+private:
+    PerTensorQuantI8Info() = default;
+
+public:
+    infiniDtype_t dtype, packed_type;
+    size_t batch_size, channel, hidden_dim, width;
+    ptrdiff_t strides_0, strides_1, strides_2, strides_3;
+    ptrdiff_t p_strides_0, p_strides_1, p_strides_2, p_strides_3;
+    int num_elements;
+    bool is_static;
+
+    static utils::Result<PerTensorQuantI8Info> createPerTensorQuantI8Info(
+        infiniopTensorDescriptor_t x_packed_desc,
+        infiniopTensorDescriptor_t x_scale_desc,
+        infiniopTensorDescriptor_t x_zero_desc,
+        infiniopTensorDescriptor_t x_desc) {
+
+        CHECK_OR_RETURN(
+            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t dtype = x_desc->dtype();
+        const infiniDtype_t packed_type = x_packed_desc->dtype();
+
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
+
+        auto shape = x_desc->shape();
+        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());
+
+        // At most 4 logical dimensions: (batch, channel, hidden, width).
+        auto ndim = x_desc->ndim();
+        CHECK_OR_RETURN(ndim <= 4,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        size_t width = shape[ndim - 1];
+        size_t hidden_dim = (ndim > 1 ?
shape[ndim - 2] : 1); + size_t channel = (ndim > 2 ? shape[ndim - 3] : 1); + size_t batch_size = (ndim > 3 ? shape[ndim - 4] : 1); + + ptrdiff_t strides_3 = x_desc->strides()[ndim - 1]; + ptrdiff_t strides_2 = (ndim > 1 ? x_desc->strides()[ndim - 2] : 0); + ptrdiff_t strides_1 = (ndim > 2 ? x_desc->strides()[ndim - 3] : 0); + ptrdiff_t strides_0 = (ndim > 3 ? x_desc->strides()[ndim - 4] : 0); + + ptrdiff_t p_strides_3 = x_packed_desc->strides()[ndim - 1]; + ptrdiff_t p_strides_2 = (ndim > 1 ? x_packed_desc->strides()[ndim - 2] : 0); + ptrdiff_t p_strides_1 = (ndim > 2 ? x_packed_desc->strides()[ndim - 3] : 0); + ptrdiff_t p_strides_0 = (ndim > 3 ? x_packed_desc->strides()[ndim - 4] : 0); + + int num_elements = 1; + for (int i = 0; i < (int)ndim; i++) { + num_elements *= static_cast(shape[i]); + } + + return utils::Result(PerTensorQuantI8Info{ + dtype, + packed_type, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements}); + } +}; + +} // namespace op::per_tensor_quant_int8 + +#endif // __PER_TENSOR_QUANT_INT8_INFO_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu new file mode 100644 index 000000000..a4aa6813d --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu @@ -0,0 +1,130 @@ +#include "../../../../devices/nvidia/nvidia_common.cuh" +#include "per_tensor_quant_int8_nvidia.cuh" + +#include "../../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../reduce/cuda/reduce.cuh" +#include + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL perTensorAbsmaxSym( + float *x_scale, const Tdata *x, + size_t batch_size, size_t channel, size_t hidden_dim, size_t width, + ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3, + int 
num_elements) { + perTensorAbsmaxSymKernel(x_scale, x, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + num_elements); +} + +template +INFINIOP_CUDA_KERNEL perTensorQuantI8Sym( + int8_t *x_packed, float *x_scale, const Tdata *x, + size_t batch_size, size_t channel, size_t hidden_dim, size_t width, + ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3, + ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3, + int num_elements) { + perTensorQuantI8SymKernel(x_packed, x_scale, x, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements); +} + +namespace op::per_tensor_quant_int8::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { + auto info = PerTensorQuantI8Info::createPerTensorQuantI8Info(x_packed_desc, x_scale_desc, x_zero_desc, x_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t per_tensor_quant_int8Kernel(const PerTensorQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, const bool is_static, cudaStream_t stream) { + int num_elements = (int)info.num_elements; + int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + size_t batch_size = info.batch_size; + size_t channel = info.channel; + size_t hidden_dim = info.hidden_dim; + size_t width = info.width; + + ptrdiff_t strides_0 = info.strides_0; + ptrdiff_t strides_1 
= info.strides_1;
+    ptrdiff_t strides_2 = info.strides_2;
+    ptrdiff_t strides_3 = info.strides_3;
+
+    ptrdiff_t p_strides_0 = info.p_strides_0;
+    ptrdiff_t p_strides_1 = info.p_strides_1;
+    ptrdiff_t p_strides_2 = info.p_strides_2;
+    ptrdiff_t p_strides_3 = info.p_strides_3;
+
+    // Only symmetric (no zero point) quantization is implemented.
+    if (x_zero == nullptr) {
+        if (is_static == false) {
+            // Dynamic quantization: recompute the scale from this tensor.
+            // BUGFIX: x_scale is reduced with a float atomic-max, so it
+            // must start from 0.0f; a stale value left in the buffer would
+            // win the max and silently corrupt the scale. Stream-ordered,
+            // so it completes before the absmax kernel runs.
+            cudaMemsetAsync(x_scale, 0, sizeof(float), stream);
+            perTensorAbsmaxSym<BLOCK_SIZE, Tdata>
+                <<<num_blocks, BLOCK_SIZE, 0, stream>>>(x_scale, x,
+                                                        batch_size, channel, hidden_dim, width,
+                                                        strides_0, strides_1, strides_2, strides_3,
+                                                        num_elements);
+        }
+        // Same-stream ordering guarantees the scale is ready here.
+        perTensorQuantI8Sym<BLOCK_SIZE, Tdata>
+            <<<num_blocks, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x,
+                                                    batch_size, channel, hidden_dim, width,
+                                                    strides_0, strides_1, strides_2, strides_3,
+                                                    p_strides_0, p_strides_1, p_strides_2, p_strides_3,
+                                                    num_elements);
+    } else {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *x_packed, void *x_scale, void *x_zero, const void *x, const bool is_static,
+                                     void *stream_) const {
+    cudaStream_t stream = (cudaStream_t)stream_;
+// Dispatch on the runtime dtype to the templated launcher.
+#define QUANT(BLOCK_SIZE, TDATA) \
+    per_tensor_quant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, is_static, stream)
+#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE)             \
+    {                                                 \
+        if (_info.dtype == INFINI_DTYPE_F16)          \
+            return QUANT(BLOCK_SIZE, half);           \
+        else if (_info.dtype == INFINI_DTYPE_F32)     \
+            return QUANT(BLOCK_SIZE, float);          \
+        else if (_info.dtype == INFINI_DTYPE_BF16)    \
+            return QUANT(BLOCK_SIZE, __nv_bfloat16);  \
+        else                                          \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;    \
+    }
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return
INFINI_STATUS_SUCCESS; +} + +} // namespace op::per_tensor_quant_int8::nvidia diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh new file mode 100644 index 000000000..4137c2d47 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ +#define __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ +#include "../per_tensor_quant_int8.h" + +DESCRIPTOR(nvidia) + +#endif // __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc b/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc new file mode 100644 index 000000000..364fbe44b --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc @@ -0,0 +1,103 @@ +#include "../../../operator.h" +#include "../../../handle.h" +#include "infiniop/ops/quant/per_tensor_quant_int8.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/per_tensor_quant_int8_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::per_tensor_quant_int8::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_packed_desc, \ + x_scale_desc, \ + x_zero_desc, \ + x_desc); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t 
infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + const bool is_static, + void *stream) { +#define QUANT(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, x_packed, x_scale, x_zero, x, is_static, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + QUANT(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + QUANT(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef QUANT +} + +__INFINI_C infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h b/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h new file mode 100644 index 000000000..f75b91173 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h @@ -0,0 +1,40 @@ +#ifndef __PER_TENSOR_QUANT_I8_H__ 
+#define __PER_TENSOR_QUANT_I8_H__ + +#include "../../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::per_tensor_quant_int8::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + PerTensorQuantI8Info _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, PerTensorQuantI8Info info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_packed_desc, \ + infiniopTensorDescriptor_t x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc, \ + infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *x_packed, void *x_scale, void *x_zero, const void *x, const bool is_static, void *stream) const; \ + }; \ + } + +#endif // __PER_TENSOR_QUANT_I8_H__ diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 275689e78..015daeca4 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -760,6 +760,79 @@ def per_channel_quant_int8_(lib): infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def per_tensor_quant_int8_(lib): + lib.infiniopCreatePerTensorQuantI8Descriptor.restype = c_int32 + lib.infiniopCreatePerTensorQuantI8Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + 
infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPerTensorQuantI8WorkspaceSize.restype = c_int32 + lib.infiniopGetPerTensorQuantI8WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPerTensorQuantI8.restype = c_int32 + lib.infiniopPerTensorQuantI8.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_bool, + c_void_p, + ] + + lib.infiniopDestroyPerTensorQuantI8Descriptor.restype = c_int32 + lib.infiniopDestroyPerTensorQuantI8Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def per_tensor_dequant_int8_(lib): + lib.infiniopCreatePerTensorDequantI8Descriptor.restype = c_int32 + lib.infiniopCreatePerTensorDequantI8Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPerTensorDequantI8WorkspaceSize.restype = c_int32 + lib.infiniopGetPerTensorDequantI8WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPerTensorDequantI8.restype = c_int32 + lib.infiniopPerTensorDequantI8.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPerTensorDequantI8Descriptor.restype = c_int32 + lib.infiniopDestroyPerTensorDequantI8Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 diff --git a/test/infiniop/per_channel_quant_int8.py b/test/infiniop/per_channel_quant_int8.py index dcbf9d1f8..f175419d9 100644 --- a/test/infiniop/per_channel_quant_int8.py +++ b/test/infiniop/per_channel_quant_int8.py @@ -24,7 +24,7 @@ # 
============================================================================== # These are not meant to be imported from other modules _TEST_CASES = [ - # x_shape, w_shape, symmetric, bias_exit, y_shape + # x_shape, symmetric ((8, 8), True), ((128, 512), True), ((128, 128), True), @@ -151,16 +151,16 @@ def lib_per_channel_quant_int8(): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: - debug(x_packed.actual_tensor(), x_p, atol=atol, rtol=rtol) + debug(x_packed.actual_tensor(), x_p, atol=2, rtol=0) debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) if symmetric == False: debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) - + if symmetric: - assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and + assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)) else: - assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and + assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) and torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)) diff --git a/test/infiniop/per_tensor_dequant_int8.py b/test/infiniop/per_tensor_dequant_int8.py new file mode 100644 index 000000000..251bc25a1 --- /dev/null +++ b/test/infiniop/per_tensor_dequant_int8.py @@ -0,0 +1,175 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant 
to be imported from other modules +_TEST_CASES = [ + # x_shape, x_stride, x_packed_stride, symmetric + ((16, 5632), None, None, True), + ((13, 4), (10, 1), None, True), + ((13, 4), (10, 1), (10, 1), True), + ((16, 5632), (13312, 1), (13312, 1), True), + ((4, 4, 5632), None, None, True), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True), + ((1, 1, 8, 1), None, None, True), + ((1, 8, 32, 32), None, None, True), + ((8, 16, 64, 128), (8388608, 524288, 8192, 1), None, True), + ((1, 2, 2304, 128), (589824, 294912, 128, 1), (589824, 294912, 128, 1), True), +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def per_tensor_dequant_int8_torch(x_packed, x_scale, dtype): + fake_qweight = x_packed.to(dtype) + dq_weight = fake_qweight * x_scale + return dq_weight + + +def test( + handle, + device, + x_shape, + x_stride, + x_packed_stride, + symmetric, + dtype=InfiniDtype.F16, + sync=None, +): + if symmetric == False: + return + print( + f"Testing Per Tensor Dequant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, x_stride:{x_stride}, x_packed_stride:{x_packed_stride}, symmetric:{symmetric} , dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + + x_packed = TestTensor( + x_shape, + x_packed_stride, + InfiniDtype.I8, + device, + randint_low=-127, + randint_high=127, + ) + x_scale = TestTensor((1,), None, InfiniDtype.F32, device) + if symmetric: + x_zero = None + else: + x_zero = TestTensor((1,), None, InfiniDtype.F32, device) + + ans = per_tensor_dequant_int8_torch( + x_packed.torch_tensor(), x_scale.torch_tensor(), x.torch_tensor().dtype + ) + if sync is not None: + sync() + + 
descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePerTensorDequantI8Descriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + x_packed.descriptor, + x_scale.descriptor, + None if symmetric else x_zero.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + x_packed.destroy_desc() + x_scale.destroy_desc() + if symmetric == False: + x_zero.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPerTensorDequantI8WorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_per_tensor_dequant_int8(): + check_error( + LIBINFINIOP.infiniopPerTensorDequantI8( + descriptor, + workspace.data(), + workspace_size.value, + x.data(), + x_packed.data(), + x_scale.data(), + None if symmetric else x_zero.data(), + None, + ) + ) + + lib_per_tensor_dequant_int8() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x.actual_tensor().float(), ans.float(), atol=atol, rtol=rtol) + + assert torch.allclose(x.actual_tensor().float(), ans.float(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: per_tensor_dequant_int8_torch(x_packed.torch_tensor(), x_scale.torch_tensor(), x.torch_tensor().dtype), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_per_tensor_dequant_int8(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPerTensorDequantI8Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + 
print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/per_tensor_quant_int8.py b/test/infiniop/per_tensor_quant_int8.py new file mode 100644 index 000000000..fba4e152d --- /dev/null +++ b/test/infiniop/per_tensor_quant_int8.py @@ -0,0 +1,201 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # x_shape, x_stride, x_packed_stride, symmetric, is_static + ((16, 5632), None, None, True, False), + ((13, 4), (10, 1), None, True, True), + ((13, 4), (10, 1), (10, 1), True, False), + ((16, 5632), (13312, 1), (13312, 1), True, True), + ((4, 4, 5632), None, None, True, False), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True, True), + ((1, 1, 8, 1), None, None, True, False), + ((1, 8, 32, 32), None, None, True, True), + ((8, 16, 64, 128), (8388608, 524288, 8192, 1), None, True, False), + ((1, 2, 2304, 128), (589824, 294912, 128, 1), (589824, 294912, 128, 1), True, True), +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def per_tensor_quant_int8_torch(x, x_scale, symmetric, is_static): + if symmetric == False: + return + else: + x 
= x.float() + if is_static: + x_q = x.mul(1 / x_scale) + x_q = torch.round(x_q).to(torch.int8) + return x_q, x_scale, None + else: + absmax = x.flatten().abs().max() + if absmax == 0: + scale = torch.tensor(1.0, device=x.device, dtype=torch.float32) + q = torch.zeros_like(x, dtype=torch.int8) + return q, scale, None + scale = absmax / 127 + x_q = x.mul(127 / absmax) + x_q = torch.round(x_q).to(torch.int8) + + return x_q, scale, None + + +def test( + handle, + device, + x_shape, + x_stride, + x_packed_stride, + symmetric, + is_static, + dtype=InfiniDtype.F16, + sync=None, +): + + print( + f"Testing Per Tensor Quant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, x_stride:{x_stride}, x_packed_stride:{x_packed_stride}, symmetric:{symmetric}, is_static:{is_static}, dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + x_packed = TestTensor( + x_shape, x_packed_stride, InfiniDtype.I8, device, mode="zeros" + ) + if is_static == False: + x_scale = TestTensor((1,), None, InfiniDtype.F32, device, mode="zeros") + else: + x_scale = TestTensor((1,), None, InfiniDtype.F32, device) + if symmetric: + x_zero = None + else: + x_zero = TestTensor((1,), None, InfiniDtype.F32, device) + if sync is not None: + sync() + + x_p, x_s, x_z = per_tensor_quant_int8_torch( + x.torch_tensor(), x_scale.torch_tensor(), symmetric, is_static + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePerTensorQuantI8Descriptor( + handle, + ctypes.byref(descriptor), + x_packed.descriptor, + x_scale.descriptor, + None if symmetric else x_zero.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + x_packed.destroy_desc() + x_scale.destroy_desc() + if symmetric == False: + x_zero.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPerTensorQuantI8WorkspaceSize( + descriptor, 
ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_per_tensor_quant_int8(): + check_error( + LIBINFINIOP.infiniopPerTensorQuantI8( + descriptor, + workspace.data(), + workspace_size.value, + x_packed.data(), + x_scale.data(), + None if symmetric else x_zero.data(), + x.data(), + is_static, + None, + ) + ) + + lib_per_tensor_quant_int8() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x_packed.actual_tensor(), x_p, atol=2, rtol=0) + debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) + if symmetric == False: + debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) + + if symmetric: + assert torch.allclose( + x_packed.actual_tensor(), x_p, atol=2, rtol=0 + ) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) + else: + assert ( + torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) + and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) + and torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: per_tensor_quant_int8_torch(x.torch_tensor(), x_scale.torch_tensor(), symmetric, is_static), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_per_tensor_quant_int8(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPerTensorQuantI8Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m")