diff --git a/include/infiniop/ops/dequant/per_tensor_dequant_int8.h b/include/infiniop/ops/dequant/per_tensor_dequant_int8.h new file mode 100644 index 000000000..9614b4303 --- /dev/null +++ b/include/infiniop/ops/dequant/per_tensor_dequant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__ +#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorDequantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc); + +__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc); + +#endif diff --git a/include/infiniop/ops/quant/per_tensor_quant_int8.h b/include/infiniop/ops/quant/per_tensor_quant_int8.h new file mode 100644 index 000000000..16e1c2bc6 --- /dev/null +++ b/include/infiniop/ops/quant/per_tensor_quant_int8.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__ +#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, 
+                                                                          infiniopTensorDescriptor_t x_scale_desc,
+                                                                          infiniopTensorDescriptor_t x_zero_desc,
+                                                                          infiniopTensorDescriptor_t x_desc);
+
+__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size);
+
+__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc,
+                                                            void *workspace,
+                                                            size_t workspace_size,
+                                                            void *x_packed,
+                                                            void *x_scale,
+                                                            void *x_zero,
+                                                            const void *x,
+                                                            const bool is_static,
+                                                            void *stream);
+
+__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
new file mode 100644
index 000000000..ea1acaa2d
--- /dev/null
+++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
@@ -0,0 +1,40 @@
+#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
+#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
+
+// Symmetric (zero-point-free) per-tensor int8 dequantization:
+//   x[index] = Tout(float(x_packed[p_index]) * x_scale[0])
+// Layout is described as (batch, channel, hidden, width) with separate
+// strides for the output and the packed tensor; a grid-stride loop
+// covers all num_elements elements.
+template <typename Tout, typename Tin>
+__device__ void perTensorDequantI8SymKernel(
+    Tout *x, const Tin *x_packed, const float *x_scale,
+    size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
+    ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
+    ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3,
+    int num_elements) {
+
+    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int grid_size = blockDim.x * gridDim.x;
+    const float x_scale_val = x_scale[0];
+    for (int tid = gid; tid < num_elements; tid += grid_size) {
+        // BUGFIX: decompose the flat index in a scratch variable. The
+        // original mutated `tid` itself, so `tid += grid_size` strode from
+        // a corrupted value whenever the grid did not cover num_elements.
+        int rem = tid;
+        int w = rem % (int)width;
+        rem /= (int)width;
+        int h = rem % (int)hidden_dim;
+        rem /= (int)hidden_dim;
+        int c = rem % (int)channel;
+        rem /= (int)channel;
+        int b = rem % (int)batch_size;
+
+        int index = w * (int)strides_3 + h * (int)strides_2 + c * (int)strides_1 + b * (int)strides_0;
+        int p_index = w * (int)p_strides_3 + h * (int)p_strides_2 + c * (int)p_strides_1 + b * (int)p_strides_0;
+
+        float val = static_cast<float>(x_packed[p_index]) * x_scale_val;
+        x[index] = static_cast<Tout>(val);
+    }
+}
+
+#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h b/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
new file mode 100644
index 000000000..4614f5f40
--- /dev/null
+++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
@@ -0,0 +1,76 @@
+#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__
+#define __PER_TENSOR_DEQUANT_INT8_INFO_H__
+
+#include "../../../../utils.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+namespace op::per_tensor_dequant_int8 {
+
+// Validated dtype/shape/stride metadata for the per-tensor int8 dequant op.
+class PerTensorDequantI8Info {
+private:
+    PerTensorDequantI8Info() = default;
+
+public:
+    infiniDtype_t dtype, packed_type;
+    size_t batch_size, channel, hidden_dim, width;
+    ptrdiff_t strides_0, strides_1, strides_2, strides_3;
+    ptrdiff_t p_strides_0, p_strides_1, p_strides_2, p_strides_3;
+    int num_elements;
+
+    static utils::Result<PerTensorDequantI8Info> createPerTensorDequantI8Info(
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t x_packed_desc,
+        infiniopTensorDescriptor_t x_scale_desc,
+        infiniopTensorDescriptor_t x_zero_desc) {
+
+        CHECK_OR_RETURN(
+            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t dtype = x_desc->dtype();
+        const infiniDtype_t packed_type = x_packed_desc->dtype();
+
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
+
+        auto shape = x_desc->shape();
+        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());
+
+        // At most 4 logical dimensions: (batch, channel, hidden, width).
+        auto ndim = x_desc->ndim();
+        CHECK_OR_RETURN(ndim <= 4,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        size_t width = shape[ndim - 1];
+        size_t hidden_dim = (ndim > 1 ? shape[ndim - 2] : 1);
+        size_t channel = (ndim > 2 ? shape[ndim - 3] : 1);
+        size_t batch_size = (ndim > 3 ?
shape[ndim - 4] : 1); + + ptrdiff_t strides_3 = x_desc->strides()[ndim - 1]; + ptrdiff_t strides_2 = (ndim > 1 ? x_desc->strides()[ndim - 2] : 0); + ptrdiff_t strides_1 = (ndim > 2 ? x_desc->strides()[ndim - 3] : 0); + ptrdiff_t strides_0 = (ndim > 3 ? x_desc->strides()[ndim - 4] : 0); + + ptrdiff_t p_strides_3 = x_packed_desc->strides()[ndim - 1]; + ptrdiff_t p_strides_2 = (ndim > 1 ? x_packed_desc->strides()[ndim - 2] : 0); + ptrdiff_t p_strides_1 = (ndim > 2 ? x_packed_desc->strides()[ndim - 3] : 0); + ptrdiff_t p_strides_0 = (ndim > 3 ? x_packed_desc->strides()[ndim - 4] : 0); + + int num_elements = 1; + for (int i = 0; i < (int)ndim; i++) { + num_elements *= static_cast(shape[i]); + } + + return utils::Result(PerTensorDequantI8Info{ + dtype, + packed_type, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements}); + } +}; + +} // namespace op::per_tensor_dequant_int8 + +#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu new file mode 100644 index 000000000..3f62e1b33 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu @@ -0,0 +1,114 @@ +#include "../../../../devices/nvidia/nvidia_common.cuh" +#include "per_tensor_dequant_int8_nvidia.cuh" + +#include "../../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../reduce/cuda/reduce.cuh" +#include + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL perTensorDequantI8Sym( + Tout *x, const Tin *x_packed, const float *x_scale, + size_t batch_size, size_t channel, size_t hidden_dim, size_t width, + ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3, + ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t 
p_strides_2, ptrdiff_t p_strides_3, + int num_elements) { + perTensorDequantI8SymKernel(x, x_packed, x_scale, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements); +} + +namespace op::per_tensor_dequant_int8::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { + auto info = PerTensorDequantI8Info::createPerTensorDequantI8Info(x_desc, x_packed_desc, x_scale_desc, x_zero_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t per_tensor_dequant_int8Kernel(const PerTensorDequantI8Info &info, Tdata *x, const int8_t *x_packed, const float *x_scale, const float *x_zero, cudaStream_t stream) { + int num_elements = (int)info.num_elements; + int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + size_t batch_size = info.batch_size; + size_t channel = info.channel; + size_t hidden_dim = info.hidden_dim; + size_t width = info.width; + + ptrdiff_t strides_0 = info.strides_0; + ptrdiff_t strides_1 = info.strides_1; + ptrdiff_t strides_2 = info.strides_2; + ptrdiff_t strides_3 = info.strides_3; + + ptrdiff_t p_strides_0 = info.p_strides_0; + ptrdiff_t p_strides_1 = info.p_strides_1; + ptrdiff_t p_strides_2 = info.p_strides_2; + ptrdiff_t p_strides_3 = info.p_strides_3; + + if (x_zero == nullptr) { + perTensorDequantI8Sym + <<>>(x, x_packed, x_scale, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, 
p_strides_2, p_strides_3, + num_elements); + } else { + return INFINI_STATUS_BAD_PARAM; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; +#define DEQUANT(BLOCK_SIZE, TDATA) \ + per_tensor_dequant_int8Kernel(_info, (TDATA *)x, (const int8_t *)x_packed, (const float *)x_scale, (const float *)x_zero, stream) +#define DEQUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return DEQUANT(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return DEQUANT(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return DEQUANT(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::per_tensor_dequant_int8::nvidia diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh new file mode 100644 index 000000000..66a7e5d03 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ +#define __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ +#include "../per_tensor_dequant_int8.h" + +DESCRIPTOR(nvidia) + +#endif // 
__PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc b/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc new file mode 100644 index 000000000..48d416847 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc @@ -0,0 +1,102 @@ +#include "../../../operator.h" +#include "../../../handle.h" +#include "infiniop/ops/dequant/per_tensor_dequant_int8.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/per_tensor_dequant_int8_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorDequantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::per_tensor_dequant_int8::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + x_packed_desc, \ + x_scale_desc, \ + x_zero_desc); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc, + void *workspace, 
+ size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream) { +#define DEQUANT(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, x, x_packed, x_scale, x_zero, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DEQUANT(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DEQUANT(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DEQUANT +} + +__INFINI_C infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h b/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h new file mode 100644 index 000000000..1ed54a8e5 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h @@ -0,0 +1,40 @@ +#ifndef __PER_TENSOR_DEQUANT_I8_H__ +#define __PER_TENSOR_DEQUANT_I8_H__ + +#include "../../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::per_tensor_dequant_int8::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + PerTensorDequantI8Info _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, PerTensorDequantI8Info info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), 
_workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t x_packed_desc, \ + infiniopTensorDescriptor_t x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *x, const void *x_packed, const void *x_scale, const void *x_zero, void *stream) const; \ + }; \ + } + +#endif // __PER_TENSOR_DEQUANT_I8_H__ diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh index 3c014de9b..91f36e4f9 100644 --- a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh +++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh @@ -1,5 +1,5 @@ -#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__ -#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#ifndef __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ +#define __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ #include __device__ inline int round_half_away_from_zero(float x) { @@ -55,8 +55,8 @@ __device__ void blockPerChannelQuantI8Kernel( float inv_scale = 1.0f / scale; float zero = -global_min * inv_scale - 128.0f; - x_scale[row] = (Tdata)scale; - x_zero[row] = (Tdata)zero; + x_scale[row] = scale; + x_zero[row] = zero; for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { @@ -111,7 +111,7 @@ __device__ void blockPerChannelQuantI8SymKernel( float inv_scale = 1.0f / scale; - x_scale[row] = (Tdata)scale; + x_scale[row] = scale; for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { @@ -270,4 +270,4 @@ __device__ void warpPerChannelQuantI8SymKernel( } } -#endif // __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#endif // __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h 
b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h index 4d1675c8c..4f1d3be2c 100644 --- a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h +++ b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h @@ -1,5 +1,5 @@ -#ifndef __QUANT_H__ -#define __QUANT_H__ +#ifndef __PER_CHANNEL_QUANT_INT8_H__ +#define __PER_CHANNEL_QUANT_INT8_H__ #include "../../../operator.h" #include "info.h" @@ -37,4 +37,4 @@ }; \ } -#endif // __QUANT_H__ +#endif // __PER_CHANNEL_QUANT_INT8_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..bb2a8d304 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh @@ -0,0 +1,130 @@ +#ifndef __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ +#define __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ + +#include + +#ifndef WARP_SIZE +#define WARP_SIZE 32 +#endif + +#define FULL_MASK 0xffffffff + + +// warp reduce max +__device__ __forceinline__ float warpReduceMax(float val) +{ + for (int offset = WARP_SIZE/2; offset > 0; offset /= 2) + val = fmaxf(val, __shfl_xor_sync(FULL_MASK, val, offset)); + return val; +} + + +// float atomic max (safe version) +__device__ __forceinline__ void atomicMaxFloat(float* addr, float val) +{ + int* addr_i = (int*)addr; + int old = *addr_i; + int assumed; + + do + { + assumed = old; + float old_f = __int_as_float(assumed); + float new_f = fmaxf(val, old_f); + + old = atomicCAS(addr_i, assumed, __float_as_int(new_f)); + + } while (assumed != old); +} + +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? 
(int)r : -(int)r;
+}
+
+// Pass 1 of dynamic symmetric quantization: reduce |x| over the whole
+// tensor and fold absmax / 127 into *x_scale with a CAS-based float
+// atomic max, one atomic per warp.
+// NOTE(review): *x_scale must be zero-initialised before this kernel
+// runs, otherwise a stale value can win the max — confirm at launch site.
+template <typename Tdata>
+__device__ void perTensorAbsmaxSymKernel(float *x_scale, const Tdata *x,
+                                         size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
+                                         ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
+                                         int num_elements) {
+    int tid = threadIdx.x;
+    int gid = blockIdx.x * blockDim.x + tid;
+    int grid_size = blockDim.x * gridDim.x;
+
+    float local_max = 0.f;
+
+    // Grid-stride loop. BUGFIX: the flat index is decomposed in a scratch
+    // variable; the original mutated the loop counter itself, corrupting
+    // the stride and skipping / repeating elements.
+    for (int i = gid; i < num_elements; i += grid_size) {
+        int rem = i;
+        int w = rem % (int)width;
+        rem /= (int)width;
+        int h = rem % (int)hidden_dim;
+        rem /= (int)hidden_dim;
+        int c = rem % (int)channel;
+        rem /= (int)channel;
+        int b = rem % (int)batch_size;
+
+        int index = w * (int)strides_3 + h * (int)strides_2 + c * (int)strides_1 + b * (int)strides_0;
+
+        local_max = fmaxf(local_max, fabsf((float)x[index]));
+    }
+
+    // Reduce within the warp, then only lane 0 of each warp does an atomic.
+    local_max = warpReduceMax(local_max);
+    if ((tid & (WARP_SIZE - 1)) == 0) {
+        atomicMaxFloat(x_scale, local_max / 127.0f);
+    }
+}
+
+// Pass 2: quantize x into int8 with the per-tensor scale, rounding half
+// away from zero and clamping symmetrically to [-127, 127].
+template <typename Tdata>
+__device__ void perTensorQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
+    ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
+    ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3,
+    int num_elements) {
+
+    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int grid_size = blockDim.x * gridDim.x;
+
+    const float scale_val = 1.0f / x_scale[0];
+
+    for (int i = gid; i < num_elements; i += grid_size) {
+        // BUGFIX: decompose in a scratch variable instead of mutating the
+        // loop counter (same grid-stride defect as the absmax kernel).
+        int rem = i;
+        int w = rem % (int)width;
+        rem /= (int)width;
+        int h = rem % (int)hidden_dim;
+        rem /= (int)hidden_dim;
+        int c = rem % (int)channel;
+        rem /= (int)channel;
+        int b = rem % (int)batch_size;
+
+        int index = w * (int)strides_3 + h * (int)strides_2 + c * (int)strides_1 + b * (int)strides_0;
+        int p_index = w * (int)p_strides_3 + h * (int)p_strides_2 + c * (int)p_strides_1 + b * (int)p_strides_0;
+
+        float qf = (float)x[index] * scale_val;
+        int q = round_half_away_from_zero(qf);
+
+        // Symmetric clamp: -127..127 (-128 is intentionally unused).
+        q = q > 127 ? 127 : (q < -127 ? -127 : q);
+
+        x_packed[p_index] = (int8_t)q;
+    }
+}
+
+#endif // __PER_TENSOR_QUANT_INT8_KERNEL_CUH__
diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/info.h b/src/infiniop/ops/quant/per_tensor_quant_int8/info.h
new file mode 100644
index 000000000..f6ed0d067
--- /dev/null
+++ b/src/infiniop/ops/quant/per_tensor_quant_int8/info.h
@@ -0,0 +1,77 @@
+#ifndef __PER_TENSOR_QUANT_INT8_INFO_H__
+#define __PER_TENSOR_QUANT_INT8_INFO_H__
+
+#include "../../../../utils.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+namespace op::per_tensor_quant_int8 {
+
+// Validated dtype/shape/stride metadata for the per-tensor int8 quant op.
+class PerTensorQuantI8Info {
+private:
+    PerTensorQuantI8Info() = default;
+
+public:
+    infiniDtype_t dtype, packed_type;
+    size_t batch_size, channel, hidden_dim, width;
+    ptrdiff_t strides_0, strides_1, strides_2, strides_3;
+    ptrdiff_t p_strides_0, p_strides_1, p_strides_2, p_strides_3;
+    int num_elements;
+    bool is_static;
+
+    static utils::Result<PerTensorQuantI8Info> createPerTensorQuantI8Info(
+        infiniopTensorDescriptor_t x_packed_desc,
+        infiniopTensorDescriptor_t x_scale_desc,
+        infiniopTensorDescriptor_t x_zero_desc,
+        infiniopTensorDescriptor_t x_desc) {
+
+        CHECK_OR_RETURN(
+            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t dtype = x_desc->dtype();
+        const infiniDtype_t packed_type = x_packed_desc->dtype();
+
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
+
+        auto shape = x_desc->shape();
+        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());
+
+        // At most 4 logical dimensions: (batch, channel, hidden, width).
+        auto ndim = x_desc->ndim();
+        CHECK_OR_RETURN(ndim <= 4,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        size_t width = shape[ndim - 1];
+        size_t hidden_dim = (ndim > 1 ?
shape[ndim - 2] : 1); + size_t channel = (ndim > 2 ? shape[ndim - 3] : 1); + size_t batch_size = (ndim > 3 ? shape[ndim - 4] : 1); + + ptrdiff_t strides_3 = x_desc->strides()[ndim - 1]; + ptrdiff_t strides_2 = (ndim > 1 ? x_desc->strides()[ndim - 2] : 0); + ptrdiff_t strides_1 = (ndim > 2 ? x_desc->strides()[ndim - 3] : 0); + ptrdiff_t strides_0 = (ndim > 3 ? x_desc->strides()[ndim - 4] : 0); + + ptrdiff_t p_strides_3 = x_packed_desc->strides()[ndim - 1]; + ptrdiff_t p_strides_2 = (ndim > 1 ? x_packed_desc->strides()[ndim - 2] : 0); + ptrdiff_t p_strides_1 = (ndim > 2 ? x_packed_desc->strides()[ndim - 3] : 0); + ptrdiff_t p_strides_0 = (ndim > 3 ? x_packed_desc->strides()[ndim - 4] : 0); + + int num_elements = 1; + for (int i = 0; i < (int)ndim; i++) { + num_elements *= static_cast(shape[i]); + } + + return utils::Result(PerTensorQuantI8Info{ + dtype, + packed_type, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements}); + } +}; + +} // namespace op::per_tensor_quant_int8 + +#endif // __PER_TENSOR_QUANT_INT8_INFO_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu new file mode 100644 index 000000000..a4aa6813d --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu @@ -0,0 +1,130 @@ +#include "../../../../devices/nvidia/nvidia_common.cuh" +#include "per_tensor_quant_int8_nvidia.cuh" + +#include "../../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../reduce/cuda/reduce.cuh" +#include + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL perTensorAbsmaxSym( + float *x_scale, const Tdata *x, + size_t batch_size, size_t channel, size_t hidden_dim, size_t width, + ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3, + int 
num_elements) { + perTensorAbsmaxSymKernel(x_scale, x, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + num_elements); +} + +template +INFINIOP_CUDA_KERNEL perTensorQuantI8Sym( + int8_t *x_packed, float *x_scale, const Tdata *x, + size_t batch_size, size_t channel, size_t hidden_dim, size_t width, + ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3, + ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3, + int num_elements) { + perTensorQuantI8SymKernel(x_packed, x_scale, x, + batch_size, channel, hidden_dim, width, + strides_0, strides_1, strides_2, strides_3, + p_strides_0, p_strides_1, p_strides_2, p_strides_3, + num_elements); +} + +namespace op::per_tensor_quant_int8::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { + auto info = PerTensorQuantI8Info::createPerTensorQuantI8Info(x_packed_desc, x_scale_desc, x_zero_desc, x_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t per_tensor_quant_int8Kernel(const PerTensorQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, const bool is_static, cudaStream_t stream) { + int num_elements = (int)info.num_elements; + int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + size_t batch_size = info.batch_size; + size_t channel = info.channel; + size_t hidden_dim = info.hidden_dim; + size_t width = info.width; + + ptrdiff_t strides_0 = info.strides_0; + ptrdiff_t strides_1 
= info.strides_1;
+    ptrdiff_t strides_2 = info.strides_2;
+    ptrdiff_t strides_3 = info.strides_3;
+
+    ptrdiff_t p_strides_0 = info.p_strides_0;
+    ptrdiff_t p_strides_1 = info.p_strides_1;
+    ptrdiff_t p_strides_2 = info.p_strides_2;
+    ptrdiff_t p_strides_3 = info.p_strides_3;
+
+    // Only symmetric (no zero point) quantization is implemented.
+    if (x_zero == nullptr) {
+        if (is_static == false) {
+            // Dynamic quantization: recompute the scale from this tensor.
+            // BUGFIX: x_scale is reduced with a float atomic-max, so it
+            // must start from 0.0f; a stale value left in the buffer would
+            // win the max and silently corrupt the scale. Stream-ordered,
+            // so it completes before the absmax kernel runs.
+            cudaMemsetAsync(x_scale, 0, sizeof(float), stream);
+            perTensorAbsmaxSym<BLOCK_SIZE, Tdata>
+                <<<num_blocks, BLOCK_SIZE, 0, stream>>>(x_scale, x,
+                                                        batch_size, channel, hidden_dim, width,
+                                                        strides_0, strides_1, strides_2, strides_3,
+                                                        num_elements);
+        }
+        // Same-stream ordering guarantees the scale is ready here.
+        perTensorQuantI8Sym<BLOCK_SIZE, Tdata>
+            <<<num_blocks, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x,
+                                                    batch_size, channel, hidden_dim, width,
+                                                    strides_0, strides_1, strides_2, strides_3,
+                                                    p_strides_0, p_strides_1, p_strides_2, p_strides_3,
+                                                    num_elements);
+    } else {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *x_packed, void *x_scale, void *x_zero, const void *x, const bool is_static,
+                                     void *stream_) const {
+    cudaStream_t stream = (cudaStream_t)stream_;
+// Dispatch on the runtime dtype to the templated launcher.
+#define QUANT(BLOCK_SIZE, TDATA) \
+    per_tensor_quant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, is_static, stream)
+#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE)             \
+    {                                                 \
+        if (_info.dtype == INFINI_DTYPE_F16)          \
+            return QUANT(BLOCK_SIZE, half);           \
+        else if (_info.dtype == INFINI_DTYPE_F32)     \
+            return QUANT(BLOCK_SIZE, float);          \
+        else if (_info.dtype == INFINI_DTYPE_BF16)    \
+            return QUANT(BLOCK_SIZE, __nv_bfloat16);  \
+        else                                          \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;    \
+    }
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return
INFINI_STATUS_SUCCESS; +} + +} // namespace op::per_tensor_quant_int8::nvidia diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh new file mode 100644 index 000000000..4137c2d47 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ +#define __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ +#include "../per_tensor_quant_int8.h" + +DESCRIPTOR(nvidia) + +#endif // __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc b/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc new file mode 100644 index 000000000..364fbe44b --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc @@ -0,0 +1,103 @@ +#include "../../../operator.h" +#include "../../../handle.h" +#include "infiniop/ops/quant/per_tensor_quant_int8.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/per_tensor_quant_int8_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::per_tensor_quant_int8::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_packed_desc, \ + x_scale_desc, \ + x_zero_desc, \ + x_desc); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t 
infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + const bool is_static, + void *stream) { +#define QUANT(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, x_packed, x_scale, x_zero, x, is_static, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + QUANT(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + QUANT(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef QUANT +} + +__INFINI_C infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h b/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h new file mode 100644 index 000000000..f75b91173 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h @@ -0,0 +1,40 @@ +#ifndef __PER_TENSOR_QUANT_I8_H__ 
+#define __PER_TENSOR_QUANT_I8_H__ + +#include "../../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::per_tensor_quant_int8::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + PerTensorQuantI8Info _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, PerTensorQuantI8Info info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_packed_desc, \ + infiniopTensorDescriptor_t x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc, \ + infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *x_packed, void *x_scale, void *x_zero, const void *x, const bool is_static, void *stream) const; \ + }; \ + } + +#endif // __PER_TENSOR_QUANT_I8_H__ diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 275689e78..015daeca4 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -760,6 +760,79 @@ def per_channel_quant_int8_(lib): infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def per_tensor_quant_int8_(lib): + lib.infiniopCreatePerTensorQuantI8Descriptor.restype = c_int32 + lib.infiniopCreatePerTensorQuantI8Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + 
infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPerTensorQuantI8WorkspaceSize.restype = c_int32 + lib.infiniopGetPerTensorQuantI8WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPerTensorQuantI8.restype = c_int32 + lib.infiniopPerTensorQuantI8.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_bool, + c_void_p, + ] + + lib.infiniopDestroyPerTensorQuantI8Descriptor.restype = c_int32 + lib.infiniopDestroyPerTensorQuantI8Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def per_tensor_dequant_int8_(lib): + lib.infiniopCreatePerTensorDequantI8Descriptor.restype = c_int32 + lib.infiniopCreatePerTensorDequantI8Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPerTensorDequantI8WorkspaceSize.restype = c_int32 + lib.infiniopGetPerTensorDequantI8WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPerTensorDequantI8.restype = c_int32 + lib.infiniopPerTensorDequantI8.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPerTensorDequantI8Descriptor.restype = c_int32 + lib.infiniopDestroyPerTensorDequantI8Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 diff --git a/test/infiniop/per_channel_quant_int8.py b/test/infiniop/per_channel_quant_int8.py index dcbf9d1f8..f175419d9 100644 --- a/test/infiniop/per_channel_quant_int8.py +++ b/test/infiniop/per_channel_quant_int8.py @@ -24,7 +24,7 @@ # 
============================================================================== # These are not meant to be imported from other modules _TEST_CASES = [ - # x_shape, w_shape, symmetric, bias_exit, y_shape + # x_shape, symmetric ((8, 8), True), ((128, 512), True), ((128, 128), True), @@ -151,16 +151,16 @@ def lib_per_channel_quant_int8(): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: - debug(x_packed.actual_tensor(), x_p, atol=atol, rtol=rtol) + debug(x_packed.actual_tensor(), x_p, atol=2, rtol=0) debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) if symmetric == False: debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) - + if symmetric: - assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and + assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)) else: - assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and + assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) and torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)) diff --git a/test/infiniop/per_tensor_dequant_int8.py b/test/infiniop/per_tensor_dequant_int8.py new file mode 100644 index 000000000..251bc25a1 --- /dev/null +++ b/test/infiniop/per_tensor_dequant_int8.py @@ -0,0 +1,175 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant 
to be imported from other modules +_TEST_CASES = [ + # x_shape, x_stride, x_packed_stride, symmetric + ((16, 5632), None, None, True), + ((13, 4), (10, 1), None, True), + ((13, 4), (10, 1), (10, 1), True), + ((16, 5632), (13312, 1), (13312, 1), True), + ((4, 4, 5632), None, None, True), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True), + ((1, 1, 8, 1), None, None, True), + ((1, 8, 32, 32), None, None, True), + ((8, 16, 64, 128), (8388608, 524288, 8192, 1), None, True), + ((1, 2, 2304, 128), (589824, 294912, 128, 1), (589824, 294912, 128, 1), True), +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def per_tensor_dequant_int8_torch(x_packed, x_scale, dtype): + fake_qweight = x_packed.to(dtype) + dq_weight = fake_qweight * x_scale + return dq_weight + + +def test( + handle, + device, + x_shape, + x_stride, + x_packed_stride, + symmetric, + dtype=InfiniDtype.F16, + sync=None, +): + if symmetric == False: + return + print( + f"Testing Per Tensor Dequant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, x_stride:{x_stride}, x_packed_stride:{x_packed_stride}, symmetric:{symmetric} , dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + + x_packed = TestTensor( + x_shape, + x_packed_stride, + InfiniDtype.I8, + device, + randint_low=-127, + randint_high=127, + ) + x_scale = TestTensor((1,), None, InfiniDtype.F32, device) + if symmetric: + x_zero = None + else: + x_zero = TestTensor((1,), None, InfiniDtype.F32, device) + + ans = per_tensor_dequant_int8_torch( + x_packed.torch_tensor(), x_scale.torch_tensor(), x.torch_tensor().dtype + ) + if sync is not None: + sync() + + 
descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePerTensorDequantI8Descriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + x_packed.descriptor, + x_scale.descriptor, + None if symmetric else x_zero.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + x_packed.destroy_desc() + x_scale.destroy_desc() + if symmetric == False: + x_zero.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPerTensorDequantI8WorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_per_tensor_dequant_int8(): + check_error( + LIBINFINIOP.infiniopPerTensorDequantI8( + descriptor, + workspace.data(), + workspace_size.value, + x.data(), + x_packed.data(), + x_scale.data(), + None if symmetric else x_zero.data(), + None, + ) + ) + + lib_per_tensor_dequant_int8() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x.actual_tensor().float(), ans.float(), atol=atol, rtol=rtol) + + assert torch.allclose(x.actual_tensor().float(), ans.float(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: per_tensor_dequant_int8_torch(x_packed.torch_tensor(), x_scale.torch_tensor(), x.torch_tensor().dtype), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_per_tensor_dequant_int8(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPerTensorDequantI8Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + 
print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/per_tensor_quant_int8.py b/test/infiniop/per_tensor_quant_int8.py new file mode 100644 index 000000000..fba4e152d --- /dev/null +++ b/test/infiniop/per_tensor_quant_int8.py @@ -0,0 +1,201 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # x_shape, x_stride, x_packed_stride, symmetric, is_static + ((16, 5632), None, None, True, False), + ((13, 4), (10, 1), None, True, True), + ((13, 4), (10, 1), (10, 1), True, False), + ((16, 5632), (13312, 1), (13312, 1), True, True), + ((4, 4, 5632), None, None, True, False), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True, True), + ((1, 1, 8, 1), None, None, True, False), + ((1, 8, 32, 32), None, None, True, True), + ((8, 16, 64, 128), (8388608, 524288, 8192, 1), None, True, False), + ((1, 2, 2304, 128), (589824, 294912, 128, 1), (589824, 294912, 128, 1), True, True), +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def per_tensor_quant_int8_torch(x, x_scale, symmetric, is_static): + if symmetric == False: + return + else: + x 
= x.float() + if is_static: + x_q = x.mul(1 / x_scale) + x_q = torch.round(x_q).to(torch.int8) + return x_q, x_scale, None + else: + absmax = x.flatten().abs().max() + if absmax == 0: + scale = torch.tensor(1.0, device=x.device, dtype=torch.float32) + q = torch.zeros_like(x, dtype=torch.int8) + return q, scale, None + scale = absmax / 127 + x_q = x.mul(127 / absmax) + x_q = torch.round(x_q).to(torch.int8) + + return x_q, scale, None + + +def test( + handle, + device, + x_shape, + x_stride, + x_packed_stride, + symmetric, + is_static, + dtype=InfiniDtype.F16, + sync=None, +): + + print( + f"Testing Per Tensor Quant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, x_stride:{x_stride}, x_packed_stride:{x_packed_stride}, symmetric:{symmetric}, is_static:{is_static}, dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + x_packed = TestTensor( + x_shape, x_packed_stride, InfiniDtype.I8, device, mode="zeros" + ) + if is_static == False: + x_scale = TestTensor((1,), None, InfiniDtype.F32, device, mode="zeros") + else: + x_scale = TestTensor((1,), None, InfiniDtype.F32, device) + if symmetric: + x_zero = None + else: + x_zero = TestTensor((1,), None, InfiniDtype.F32, device) + if sync is not None: + sync() + + x_p, x_s, x_z = per_tensor_quant_int8_torch( + x.torch_tensor(), x_scale.torch_tensor(), symmetric, is_static + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePerTensorQuantI8Descriptor( + handle, + ctypes.byref(descriptor), + x_packed.descriptor, + x_scale.descriptor, + None if symmetric else x_zero.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + x_packed.destroy_desc() + x_scale.destroy_desc() + if symmetric == False: + x_zero.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPerTensorQuantI8WorkspaceSize( + descriptor, 
ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_per_tensor_quant_int8(): + check_error( + LIBINFINIOP.infiniopPerTensorQuantI8( + descriptor, + workspace.data(), + workspace_size.value, + x_packed.data(), + x_scale.data(), + None if symmetric else x_zero.data(), + x.data(), + is_static, + None, + ) + ) + + lib_per_tensor_quant_int8() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x_packed.actual_tensor(), x_p, atol=2, rtol=0) + debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) + if symmetric == False: + debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) + + if symmetric: + assert torch.allclose( + x_packed.actual_tensor(), x_p, atol=2, rtol=0 + ) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) + else: + assert ( + torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) + and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) + and torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: per_tensor_quant_int8_torch(x.torch_tensor(), x_scale.torch_tensor(), symmetric, is_static), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_per_tensor_quant_int8(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPerTensorQuantI8Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m")